Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Made a few optimizations for running OpenTurbine on GPU. #350

Merged
merged 4 commits into from
Feb 26, 2025

Conversation

ddement
Copy link
Collaborator

@ddement ddement commented Feb 7, 2025

  1. Removed deep copies from system setup code, saving a lot of time for large problems
  2. Changed to using KLU2 even on GPU, which improved the 3000 blade case from 32s to 16s. Also improved 3 blade case solve time by >50%.
  3. Merged loops to increase parallelism in system matrix contribution.

1. Removed deep copies from system setup code, saving a lot of time for large problems
2. Changed to using KLU2 even on GPU, which improved the 3000 blade case from 32s to 16s.
   Also improved 3 blade case solve time by >50%.
3. Merged loops to increase parallelism in system matrix contribution.
@ddement ddement self-assigned this Feb 7, 2025
@ddement ddement marked this pull request as ready for review February 7, 2025 23:07
Base automatically changed from restart_from_file to main February 8, 2025 03:46
@faisal-bhuiyan faisal-bhuiyan added the code-efficiency Related to making code more efficient label Feb 13, 2025
Comment on lines 10 to 19
inline void PopulateNodeX0(
const BeamElement& elem, const std::vector<Node>& nodes,
const Kokkos::View<double* [7], Kokkos::LayoutStride, Kokkos::HostSpace>& node_x0
) {
// Map node positions from [0,1] to [-1,1]
std::vector<double> node_xi(elem.node_ids.size());
for (size_t i = 0; i < elem.node_ids.size(); ++i) {
node_xi[i] = 2 * nodes[elem.node_ids[i]].s - 1;
}

// Populate node initial position and orientation
for (size_t j = 0; j < elem.node_ids.size(); ++j) {
for (size_t k = 0U; k < kLieGroupComponents; ++k) {
node_x0(j, k) = nodes[elem.node_ids[j]].x[k];
}
}
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I propose the following readability improvement:

/// @brief Populate the node initial position and orientation
inline void PopulateNodeX0(
    const BeamElement& elem, const std::vector<Node>& nodes,
    const Kokkos::View<double* [7], Kokkos::LayoutStride, Kokkos::HostSpace>& node_x0
) {
    for (size_t j = 0; j < elem.node_ids.size(); ++j) {
        for (size_t k = 0U; k < 7; ++k) {
            node_x0(j, k) = nodes[elem.node_ids[j]].x[k];
        }
    }
}


// Populate quadrature weights
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's keep the comment like so

/// @brief Populate the quadrature weights


LagrangePolynomialDerivWeights(qp_xi, node_xi, weights);
for (size_t k = 0; k < node_xi.size(); ++k) {
shape_deriv(k, j) = weights[k];
}
}
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could remove the duplicate code here (Lines 30-68) by doing the following. There might be more optimization possible by combining the shape + shape deriv functions into one however I suppose you wanted to keep them separate.

/// @brief Map node positions from [0,1] to [-1,1]
inline std::vector<double> MapNodePositions(
    const BeamElement& elem, const std::vector<Node>& nodes
) {
    std::vector<double> node_xi(elem.node_ids.size());
    for (size_t i = 0; i < elem.node_ids.size(); ++i) {
        node_xi[i] = 2 * nodes[elem.node_ids[i]].s - 1;
    }
    return node_xi;
}

/// @brief Populate the shape function values
inline void PopulateShapeFunctionValues(
    const BeamElement& elem, const std::vector<Node>& nodes,
    const Kokkos::View<double**, Kokkos::LayoutStride, Kokkos::HostSpace>& shape_interp
) {
    const auto node_xi = MapNodePositions(elem, nodes);

    // Populate shape interpolation values (Lagrange polynomials)
    std::vector<double> weights;
    for (size_t j = 0; j < elem.quadrature.size(); ++j) {
        auto qp_xi = elem.quadrature[j][0];
        LagrangePolynomialInterpWeights(qp_xi, node_xi, weights);
        for (size_t k = 0; k < node_xi.size(); ++k) {
            shape_interp(k, j) = weights[k];
        }
    }
}

/// @brief Populate the shape function derivatives
inline void PopulateShapeFunctionDerivatives(
    const BeamElement& elem, const std::vector<Node>& nodes,
    const Kokkos::View<double**, Kokkos::LayoutStride, Kokkos::HostSpace>& shape_deriv
) {
    const auto node_xi = MapNodePositions(elem, nodes);

    // Populate shape function derivative values (Lagrange polynomials)
    std::vector<double> weights;
    for (size_t j = 0; j < elem.quadrature.size(); ++j) {
        auto qp_xi = elem.quadrature[j][0];
        LagrangePolynomialDerivWeights(qp_xi, node_xi, weights);
        for (size_t k = 0; k < node_xi.size(); ++k) {
            shape_deriv(k, j) = weights[k];
        }
    }
}

Comment on lines 98 to 124
inline void PopulateQPCstar(
const BeamElement& elem,
const Kokkos::View<double* [6][6], Kokkos::LayoutStride, Kokkos::HostSpace>& qp_Cstar
) {
std::vector<double> weights(elem.sections.size());
std::vector<double> section_xi(elem.sections.size());
for (size_t i = 0; i < elem.sections.size(); ++i) {
section_xi[i] = 2 * elem.sections[i].position - 1;
}

for (size_t i = 0; i < elem.quadrature.size(); ++i) {
auto qp_xi = elem.quadrature[i][0];
LinearInterpWeights(qp_xi, section_xi, weights);
for (size_t m = 0; m < 6; ++m) {
for (size_t n = 0; n < 6; ++n) {
qp_Cstar(i, m, n) = 0.;
}
}
for (size_t j = 0; j < section_xi.size(); ++j) {
for (size_t m = 0; m < 6; ++m) {
for (size_t n = 0; n < 6; ++n) {
qp_Cstar(i, m, n) += elem.sections[j].C_star[m][n] * weights[j];
}
}
}
}
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On a similar vein to my previous comment, we could modularize as follows to remove redundant code in M* and C* population. And I wonder if we could write a lambda-like function to consolidate the sectional matrix creation logic since they look awfully similar -- I leave that to your judgement.

/// @brief Map section positions from [0,1] to [-1,1]
inline std::vector<double> MapSectionPositions(const BeamElement& elem) {
    std::vector<double> section_xi(elem.sections.size());
    for (size_t i = 0; i < elem.sections.size(); ++i) {
        section_xi[i] = 2 * elem.sections[i].position - 1;
    }
    return section_xi;
}

/// @brief Populate section mass matrix Mstar
inline void PopulateQPMstar(
    const BeamElement& elem,
    const Kokkos::View<double* [6][6], Kokkos::LayoutStride, Kokkos::HostSpace>& qp_Mstar
) {
    const auto section_xi = MapSectionPositions(elem);
    std::vector<double> weights(elem.sections.size());
    for (size_t i = 0; i < elem.quadrature.size(); ++i) {
        auto qp_xi = elem.quadrature[i][0];
        LinearInterpWeights(qp_xi, section_xi, weights);
        for (size_t m = 0; m < 6; ++m) {
            for (size_t n = 0; n < 6; ++n) {
                qp_Mstar(i, m, n) = 0.;
            }
        }
        for (size_t j = 0; j < section_xi.size(); ++j) {
            for (size_t m = 0; m < 6; ++m) {
                for (size_t n = 0; n < 6; ++n) {
                    qp_Mstar(i, m, n) += elem.sections[j].M_star[m][n] * weights[j];
                }
            }
        }
    }
}

/// @brief Populate section stiffness matrix Cstar
inline void PopulateQPCstar(
    const BeamElement& elem,
    const Kokkos::View<double* [6][6], Kokkos::LayoutStride, Kokkos::HostSpace>& qp_Cstar
) {
    const auto section_xi = MapSectionPositions(elem);
    std::vector<double> weights(elem.sections.size());
    for (size_t i = 0; i < elem.quadrature.size(); ++i) {
        auto qp_xi = elem.quadrature[i][0];
        LinearInterpWeights(qp_xi, section_xi, weights);
        for (size_t m = 0; m < 6; ++m) {
            for (size_t n = 0; n < 6; ++n) {
                qp_Cstar(i, m, n) = 0.;
            }
        }
        for (size_t j = 0; j < section_xi.size(); ++j) {
            for (size_t m = 0; m < 6; ++m) {
                for (size_t n = 0; n < 6; ++n) {
                    qp_Cstar(i, m, n) += elem.sections[j].C_star[m][n] * weights[j];
                }
            }
        }
    }
}

@ddement ddement merged commit a68ae74 into main Feb 26, 2025
10 of 18 checks passed
@ddement ddement deleted the small_problem_gpu_optimization branch February 26, 2025 20:15
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
code-efficiency Related to making code more efficient
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants