Skip to content

Commit

Permalink
Optimize 3x3 blocks (#518)
Browse files Browse the repository at this point in the history
  • Loading branch information
blegouix authored Jul 15, 2024
1 parent 30215e4 commit 0da9a03
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 52 deletions.
11 changes: 6 additions & 5 deletions include/ddc/kernels/splines/spline_builder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -300,17 +300,18 @@ class SplineBuilder
/**
* @brief Get the whole domain on which spline coefficients are defined, with the dimension of interest being the leading dimension.
*
* This is used internally due to solver limitation and because it may be beneficial to computation performance.
* This is used internally due to solver limitation and because it may be beneficial to computation performance. For LAPACK backend and non-periodic boundary condition, we are using SplinesLinearSolver3x3Blocks which requires upper_block_size additional rows for internal operations.
*
* @return The (transposed) domain for the spline coefficients.
*/
batched_spline_tr_domain_type batched_spline_tr_domain() const noexcept
{
return batched_spline_tr_domain_type(
batched_spline_domain().restrict(ddc::DiscreteDomain<bsplines_type>(
return batched_spline_tr_domain_type(ddc::replace_dim_of<bsplines_type, bsplines_type>(
batched_spline_domain(),
ddc::DiscreteDomain<bsplines_type>(
ddc::DiscreteElement<bsplines_type>(0),
ddc::DiscreteVector<bsplines_type>(
ddc::discrete_space<bsplines_type>().nbasis()))));
matrix->required_number_of_rhs_rows()))));
}

public:
Expand Down Expand Up @@ -846,7 +847,7 @@ operator()(
// Create a 2D Kokkos::View to manage spline_tr as a matrix
Kokkos::View<double**, Kokkos::LayoutRight, exec_space> bcoef_section(
spline_tr.data_handle(),
ddc::discrete_space<bsplines_type>().nbasis(),
static_cast<std::size_t>(spline_tr.template extent<bsplines_type>()),
batch_domain().size());
// Compute spline coef
matrix->solve(bcoef_section);
Expand Down
20 changes: 20 additions & 0 deletions include/ddc/kernels/splines/splines_linear_problem.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,26 @@ class SplinesLinearProblem
{
return m_size;
}

/**
* @brief Get the required number of rows of the multi-rhs view passed to solve().
*
* Implementations may require a number of rows larger than what `size` returns for optimization purposes.
*
* @return The required number of rows of the multi-rhs view. It is guaranteed to be greater or equal to `size`.
*/
std::size_t required_number_of_rhs_rows() const
{
std::size_t const nrows = impl_required_number_of_rhs_rows();
assert(nrows >= size());
return nrows;
}

private:
virtual std::size_t impl_required_number_of_rhs_rows() const
{
return m_size;
}
};

/**
Expand Down
99 changes: 66 additions & 33 deletions include/ddc/kernels/splines/splines_linear_problem_3x3_blocks.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,71 +96,89 @@ class SplinesLinearProblem3x3Blocks : public SplinesLinearProblem2x2Blocks<ExecS
* @brief Perform row interchanges on multiple right-hand sides to get a 2-blocks structure (matching the requirements
* of the SplinesLinearProblem2x2Blocks solver).
*
* | b_top | | b_center |
* | b_center | -> | b_top | -- Considered as a
* | b_bottom | | b_bottom | -- single bottom block
* | b_top | | - |
* | b_center | -> | b_center |
* | b_bottom | | b_top | -- Considered as a
* | - | | b_bottom | -- single bottom block
*
* @param b The multiple right-hand sides.
*/
void interchange_rows_from_3_to_2_blocks_rhs(MultiRHS const b) const
{
std::size_t const nq = m_top_left_block->size(); // size of the center block

// prevent Kokkos::deep_copy(b_top_dst, b_top) to be a deep_copy between overlapping allocations
assert(nq >= m_top_size);

MultiRHS const b_top = Kokkos::
subview(b, std::pair<std::size_t, std::size_t> {0, m_top_size}, Kokkos::ALL);
MultiRHS const b_center = Kokkos::
MultiRHS const b_bottom = Kokkos::
subview(b,
std::pair<std::size_t, std::size_t> {m_top_size, m_top_size + nq},
std::pair<std::size_t, std::size_t> {m_top_size + nq, size()},
Kokkos::ALL);

MultiRHS const b_center_dst
= Kokkos::subview(b, std::pair<std::size_t, std::size_t> {0, nq}, Kokkos::ALL);
MultiRHS const b_top_dst = Kokkos::
subview(b, std::pair<std::size_t, std::size_t> {nq, nq + m_top_size}, Kokkos::ALL);
subview(b,
std::pair<std::size_t, std::size_t> {m_top_size + nq, 2 * m_top_size + nq},
Kokkos::ALL);
MultiRHS const b_bottom_dst = Kokkos::
subview(b,
std::pair<
std::size_t,
std::size_t> {2 * m_top_size + nq, m_top_size + size()},
Kokkos::ALL);

MultiRHS const buffer = Kokkos::create_mirror(ExecSpace(), b_center);
if (b_bottom.extent(0) > b_top.extent(0)) {
// Need a buffer to prevent overlapping
MultiRHS const buffer = Kokkos::create_mirror(ExecSpace(), b_bottom);

Kokkos::deep_copy(buffer, b_center);
Kokkos::deep_copy(buffer, b_bottom);
Kokkos::deep_copy(b_bottom_dst, buffer);
} else {
Kokkos::deep_copy(b_bottom_dst, b_bottom);
}
Kokkos::deep_copy(b_top_dst, b_top);
Kokkos::deep_copy(b_center_dst, buffer);
}

/**
* @brief Perform row interchanges on multiple right-hand sides to restore its 3-blocks structure.
*
* | b_center | | b_top |
* | b_top | -> | b_center |
* | b_bottom | | b_bottom |
* | - | | b_top |
* | b_center | -> | b_center |
* | b_top | | b_bottom |
* | b_bottom | | - |
*
* @param b The multiple right-hand sides.
*/
void interchange_rows_from_2_to_3_blocks_rhs(MultiRHS const b) const
{
std::size_t const nq = m_top_left_block->size(); // size of the center block

// prevent Kokkos::deep_copy(b_top, b_top_src) to be a deep_copy between overlapping allocations
assert(nq >= m_top_size);

MultiRHS const b_center_src
= Kokkos::subview(b, std::pair<std::size_t, std::size_t> {0, nq}, Kokkos::ALL);
MultiRHS const b_top_src = Kokkos::
subview(b, std::pair<std::size_t, std::size_t> {nq, nq + m_top_size}, Kokkos::ALL);

MultiRHS const b_top = Kokkos::
subview(b, std::pair<std::size_t, std::size_t> {0, m_top_size}, Kokkos::ALL);
MultiRHS const b_center = Kokkos::
MultiRHS const b_bottom = Kokkos::
subview(b,
std::pair<std::size_t, std::size_t> {m_top_size, m_top_size + nq},
std::pair<std::size_t, std::size_t> {m_top_size + nq, size()},
Kokkos::ALL);

MultiRHS const buffer = Kokkos::create_mirror(ExecSpace(), b_center);
MultiRHS const b_top_src = Kokkos::
subview(b,
std::pair<std::size_t, std::size_t> {m_top_size + nq, 2 * m_top_size + nq},
Kokkos::ALL);
MultiRHS const b_bottom_src = Kokkos::
subview(b,
std::pair<
std::size_t,
std::size_t> {2 * m_top_size + nq, m_top_size + size()},
Kokkos::ALL);

Kokkos::deep_copy(buffer, b_center_src);
Kokkos::deep_copy(b_top, b_top_src);
Kokkos::deep_copy(b_center, buffer);
if (b_bottom.extent(0) > b_top.extent(0)) {
// Need a buffer to prevent overlapping
MultiRHS const buffer = Kokkos::create_mirror(ExecSpace(), b_bottom);

Kokkos::deep_copy(buffer, b_bottom_src);
Kokkos::deep_copy(b_bottom, buffer);
} else {
Kokkos::deep_copy(b_bottom, b_bottom_src);
}
}

public:
Expand All @@ -169,17 +187,32 @@ class SplinesLinearProblem3x3Blocks : public SplinesLinearProblem2x2Blocks<ExecS
*
* Perform row interchanges on multiple right-hand sides to obtain a 2x2-blocks linear problem and call the SplinesLinearProblem2x2Blocks solver.
*
* @param[in, out] b A 2D Kokkos::View storing the multiple right-hand sides of the problem and receiving the corresponding solution.
* This class requires an additional allocation corresponding to top_size rows for internal operation.
*
* @param[in, out] b A 2D Kokkos::View storing the multiple right-hand sides (+ additional garbage allocation) of the problem and receiving the corresponding solution.
* @param transpose Choose between the direct or transposed version of the linear problem.
*/
void solve(MultiRHS const b, bool const transpose) const override
{
assert(b.extent(0) == size());
assert(b.extent(0) == size() + m_top_size);

interchange_rows_from_3_to_2_blocks_rhs(b);
SplinesLinearProblem2x2Blocks<ExecSpace>::solve(b, transpose);
SplinesLinearProblem2x2Blocks<ExecSpace>::
solve(Kokkos::
subview(b,
std::pair<
std::size_t,
std::size_t> {m_top_size, m_top_size + size()},
Kokkos::ALL),
transpose);
interchange_rows_from_2_to_3_blocks_rhs(b);
}

private:
std::size_t impl_required_number_of_rhs_rows() const override
{
return size() + m_top_size;
}
};

} // namespace ddc::detail
43 changes: 29 additions & 14 deletions tests/splines/splines_linear_problem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ namespace DDC_HIP_5_7_ANONYMOUS_NAMESPACE_WORKAROUND(MATRIX_CPP)
void fill_identity(
ddc::detail::SplinesLinearProblem<Kokkos::DefaultHostExecutionSpace>::MultiRHS mat)
{
assert(mat.extent(0) == mat.extent(1));
for (std::size_t i(0); i < mat.extent(0); ++i) {
for (std::size_t j(0); j < mat.extent(1); ++j) {
mat(i, j) = int(i == j);
Expand Down Expand Up @@ -93,33 +92,45 @@ namespace DDC_HIP_5_7_ANONYMOUS_NAMESPACE_WORKAROUND(MATRIX_CPP)

splines_linear_problem.setup_solver();

Kokkos::DualView<double*> inv_ptr("inv_ptr", N * N);
Kokkos::DualView<double*>
inv_ptr("inv_ptr", splines_linear_problem.required_number_of_rhs_rows() * N);
ddc::detail::SplinesLinearProblem<Kokkos::DefaultHostExecutionSpace>::MultiRHS
inv(inv_ptr.h_view.data(), N, N);
inv(inv_ptr.h_view.data(), splines_linear_problem.required_number_of_rhs_rows(), N);
fill_identity(inv);
inv_ptr.modify_host();
inv_ptr.sync_device();
splines_linear_problem.solve(
ddc::detail::SplinesLinearProblem<
Kokkos::DefaultExecutionSpace>::MultiRHS(inv_ptr.d_view.data(), N, N));
ddc::detail::SplinesLinearProblem<Kokkos::DefaultExecutionSpace>::MultiRHS(
inv_ptr.d_view.data(),
splines_linear_problem.required_number_of_rhs_rows(),
N));
inv_ptr.modify_device();
inv_ptr.sync_host();

Kokkos::DualView<double*> inv_tr_ptr("inv_tr_ptr", N * N);
Kokkos::DualView<double*>
inv_tr_ptr("inv_tr_ptr", splines_linear_problem.required_number_of_rhs_rows() * N);
ddc::detail::SplinesLinearProblem<Kokkos::DefaultHostExecutionSpace>::MultiRHS
inv_tr(inv_tr_ptr.h_view.data(), N, N);
inv_tr(inv_tr_ptr.h_view.data(),
splines_linear_problem.required_number_of_rhs_rows(),
N);
fill_identity(inv_tr);
inv_tr_ptr.modify_host();
inv_tr_ptr.sync_device();
splines_linear_problem
.solve(ddc::detail::SplinesLinearProblem<Kokkos::DefaultExecutionSpace>::
MultiRHS(inv_tr_ptr.d_view.data(), N, N),
.solve(ddc::detail::SplinesLinearProblem<Kokkos::DefaultExecutionSpace>::MultiRHS(
inv_tr_ptr.d_view.data(),
splines_linear_problem.required_number_of_rhs_rows(),
N),
true);
inv_tr_ptr.modify_device();
inv_tr_ptr.sync_host();

check_inverse(val, inv);
check_inverse_transpose(val, inv_tr);
check_inverse(
val,
Kokkos::subview(inv, std::pair<std::size_t, std::size_t> {0, N}, Kokkos::ALL));
check_inverse_transpose(
val,
Kokkos::subview(inv_tr, std::pair<std::size_t, std::size_t> {0, N}, Kokkos::ALL));
}

} // namespace )
Expand Down Expand Up @@ -255,12 +266,13 @@ TEST(SplinesLinearProblem, 3x3Blocks)
{
std::size_t const N = 10;
std::size_t const k = 10;
std::size_t const top_size = 2;
std::unique_ptr<ddc::detail::SplinesLinearProblem<Kokkos::DefaultExecutionSpace>> center_block
= std::make_unique<
ddc::detail::SplinesLinearProblemDense<Kokkos::DefaultExecutionSpace>>(N - 5);
std::unique_ptr<ddc::detail::SplinesLinearProblem<Kokkos::DefaultExecutionSpace>>
splines_linear_problem = std::make_unique<ddc::detail::SplinesLinearProblem3x3Blocks<
Kokkos::DefaultExecutionSpace>>(N, 2, std::move(center_block));
Kokkos::DefaultExecutionSpace>>(N, top_size, std::move(center_block));

// Build a non-symmetric full-rank matrix (without zero)
for (std::size_t i(0); i < N; ++i) {
Expand Down Expand Up @@ -350,10 +362,11 @@ TEST_P(SplinesLinearProblemSizesFixture, OffsetBanded)
TEST_P(SplinesLinearProblemSizesFixture, 2x2Blocks)
{
auto const [N, k] = GetParam();
std::size_t const bottom_size = 3;
std::unique_ptr<ddc::detail::SplinesLinearProblem<Kokkos::DefaultExecutionSpace>>
splines_linear_problem
= ddc::detail::SplinesLinearProblemMaker::make_new_block_matrix_with_band_main_block<
Kokkos::DefaultExecutionSpace>(N, k, k, false, 3);
Kokkos::DefaultExecutionSpace>(N, k, k, false, bottom_size);

// Build a non-symmetric full-rank band matrix
for (std::size_t i(0); i < N; ++i) {
Expand All @@ -372,10 +385,12 @@ TEST_P(SplinesLinearProblemSizesFixture, 2x2Blocks)
TEST_P(SplinesLinearProblemSizesFixture, 3x3Blocks)
{
auto const [N, k] = GetParam();
std::size_t const bottom_size = 3;
std::size_t const top_size = 2;
std::unique_ptr<ddc::detail::SplinesLinearProblem<Kokkos::DefaultExecutionSpace>>
splines_linear_problem
= ddc::detail::SplinesLinearProblemMaker::make_new_block_matrix_with_band_main_block<
Kokkos::DefaultExecutionSpace>(N, k, k, false, 3, 2);
Kokkos::DefaultExecutionSpace>(N, k, k, false, bottom_size, top_size);

// Build a non-symmetric full-rank band matrix
for (std::size_t i(0); i < N; ++i) {
Expand Down

0 comments on commit 0da9a03

Please sign in to comment.