From dc2929b416f7eabb0fc123599b4391efa69cd40a Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 5 Aug 2024 23:34:56 -0600 Subject: [PATCH] Work around kokkos issue 7036 sort_by_key on host backends, with nvcc as compiler would produce build errors. The fix is present in Kokkos >= 40400 (4.4.0 and up, including the current develop branch). So for older versions, if building for CUDA then disable bulk sorting code paths that use sort_by_key. --- sparse/impl/KokkosSparse_sort_crs_impl.hpp | 26 ++++++++++----- sparse/src/KokkosSparse_SortCrs.hpp | 39 +++++++++++++++++++--- 2 files changed, 53 insertions(+), 12 deletions(-) diff --git a/sparse/impl/KokkosSparse_sort_crs_impl.hpp b/sparse/impl/KokkosSparse_sort_crs_impl.hpp index 54362d31a3..ede1a839e7 100644 --- a/sparse/impl/KokkosSparse_sort_crs_impl.hpp +++ b/sparse/impl/KokkosSparse_sort_crs_impl.hpp @@ -19,6 +19,14 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Sort.hpp" +// Workaround for issue with Kokkos::Experimental::sort_by_key, with nvcc and OpenMP enabled +// (Kokkos issue #7036, fixed in 4.4 release) +// Once support for Kokkos < 4.4 is dropped, +// all code inside "ifdef KK_DISABLE_BULK_SORT_BY_KEY" can be deleted. +#if (KOKKOS_VERSION < 40400) && defined(KOKKOS_ENABLE_CUDA) +#define KK_DISABLE_BULK_SORT_BY_KEY +#endif + namespace KokkosSparse { namespace Impl { @@ -244,6 +252,7 @@ Kokkos::View generateBulkCrsKeys(const ExecSpace& exec, co return keys; } +#ifndef KK_DISABLE_BULK_SORT_BY_KEY template Kokkos::View computeEntryPermutation( const ExecSpace& exec, const Rowmap& rowmap, const Entries& entries, typename Entries::non_const_value_type ncols) { @@ -258,6 +267,15 @@ Kokkos::View computeEntryPerm return permutation; } +// Heuristic for choosing bulk sorting algorithm +template +bool useBulkSortHeuristic(Ordinal avgDeg, Ordinal maxDeg) { + // Use bulk sort if matrix is highly imbalanced, + // OR the longest rows have many entries. + return (maxDeg / 10 > avgDeg) || (maxDeg > 1024); +} +#endif + template void applyPermutation(const ExecSpace& exec, const Permutation& permutation, const InView& in, const OutView& out) { Kokkos::parallel_for( @@ -281,14 +299,6 @@ void applyPermutationBlockValues(const ExecSpace& exec, const Permutation& permu }); } -// Heuristic for choosing bulk sorting algorithm -template -bool useBulkSortHeuristic(Ordinal avgDeg, Ordinal maxDeg) { - // Use bulk sort if matrix is highly imbalanced, - // OR the longest rows have many entries. - return (maxDeg / 10 > avgDeg) || (maxDeg > 1024); -} - } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/src/KokkosSparse_SortCrs.hpp b/sparse/src/KokkosSparse_SortCrs.hpp index 89cad01127..177709bbaf 100644 --- a/sparse/src/KokkosSparse_SortCrs.hpp +++ b/sparse/src/KokkosSparse_SortCrs.hpp @@ -90,7 +90,8 @@ void sort_crs_matrix(const execution_space& exec, const rowmap_t& rowmap, const // If the matrix is highly imbalanced, or has long rows AND the dimensions // are not too large to do one large bulk sort, do that. Otherwise, sort // using one Kokkos thread per row. - Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows; + Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows; +#ifndef KK_DISABLE_BULK_SORT_BY_KEY Ordinal maxDeg = KokkosSparse::Impl::graph_max_degree(exec, rowmap); bool useBulkSort = false; if (KokkosSparse::Impl::useBulkSortHeuristic(avgDeg, maxDeg)) { @@ -113,7 +114,11 @@ void sort_crs_matrix(const execution_space& exec, const rowmap_t& rowmap, const Kokkos::deep_copy(exec, origEntries, entries); KokkosSparse::Impl::applyPermutation(exec, permutation, origEntries, entries); KokkosSparse::Impl::applyPermutation(exec, permutation, origValues, values); - } else { + } else +#else + (void)numCols; +#endif + { using TeamPol = Kokkos::TeamPolicy; // Can't use bulk sort approach as matrix dimensions are too large. // Fall back to parallel thread-level sort within each row. @@ -179,7 +184,28 @@ void sort_bsr_matrix(const execution_space& exec, Ordinal blockSize, const rowma throw std::invalid_argument( "sort_bsr_matrix: implementation requires that numRows * numCols is " "representable in uint64_t"); +#ifdef KK_DISABLE_BULK_SORT_BY_KEY + using TeamPol = Kokkos::TeamPolicy; + using Offset = typename rowmap_t::non_const_value_type; + // Temporary workaround: do not use Kokkos::Experimental::sort_by_key, instead + // sort bulk keys one row at a time + auto keys = Impl::generateBulkCrsKeys(exec, rowmap, entries, numCols); + Kokkos::View permutation(Kokkos::view_alloc(Kokkos::WithoutInitializing, "permutation"), + entries.extent(0)); + Ordinal vectorLength = 1; + Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows; + while (vectorLength < avgDeg / 2) { + vectorLength *= 2; + } + if (vectorLength > TeamPol ::vector_length_max()) vectorLength = TeamPol ::vector_length_max(); + Impl::MatrixSortThreadFunctor funct( + numRows, rowmap, entries, permutation); + Ordinal teamSize = TeamPol(exec, 1, 1, vectorLength).team_size_recommended(funct, Kokkos::ParallelForTag()); + Kokkos::parallel_for("sort_bulk_keys_by_row[GPU,bitonic]", + TeamPol(exec, (numRows + teamSize - 1) / teamSize, teamSize, vectorLength), funct); +#else auto permutation = KokkosSparse::Impl::computeEntryPermutation(exec, rowmap, entries, numCols); +#endif // Permutations cannot be done in-place Kokkos::View origValues( Kokkos::view_alloc(Kokkos::WithoutInitializing, "origValues"), values.extent(0)); @@ -254,7 +280,8 @@ void sort_crs_graph(const execution_space& exec, const rowmap_t& rowmap, const e // If the graph is highly imbalanced AND the dimensions are not too large // to do one large bulk sort, do that. Otherwise, sort using one Kokkos // thread per row. - Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows; + Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows; +#ifndef KK_DISABLE_BULK_SORT_BY_KEY Ordinal maxDeg = KokkosSparse::Impl::graph_max_degree(exec, rowmap); bool useBulkSort = false; if (KokkosSparse::Impl::useBulkSortHeuristic(avgDeg, maxDeg)) { @@ -269,7 +296,11 @@ void sort_crs_graph(const execution_space& exec, const rowmap_t& rowmap, const e if (useBulkSort) { auto keys = KokkosSparse::Impl::generateBulkCrsKeys(exec, rowmap, entries, numCols); Kokkos::Experimental::sort_by_key(exec, keys, entries); - } else { + } else +#else + (void)numCols; +#endif + { using TeamPol = Kokkos::TeamPolicy; // Fall back to thread-level sort within each row Ordinal vectorLength = 1;