Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DO NOT MERGE] code annotation for performance measurement #4952

Draft
wants to merge 1 commit into
base: branch-25.04
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,8 @@ shuffle_and_organize_output(
std::optional<rmm::device_uvector<label_t>>&& labels,
std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank)
{
RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time0 = std::chrono::steady_clock::now();
std::optional<rmm::device_uvector<size_t>> offsets{std::nullopt};

if (labels) {
Expand Down Expand Up @@ -741,6 +743,10 @@ shuffle_and_organize_output(
handle.get_thrust_policy(), offsets->begin(), offsets->end(), offsets->begin());
labels = std::move(unique_labels);
}
RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time1 = std::chrono::steady_clock::now();
std::chrono::duration<double> dur0 = time1 - time0;
std::cout << "\tdetail::shuffle_and_organize took " << dur0.count() << std::endl;

return std::make_tuple(std::move(majors),
std::move(minors),
Expand Down
57 changes: 57 additions & 0 deletions cpp/src/sampling/neighbor_sampling_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,18 @@ neighbor_sample_impl(raft::handle_t const& handle,
bool dedupe_sources,
bool do_expensive_check)
{
if constexpr (multi_gpu) {
auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
auto const major_comm_rank = major_comm.get_rank();
auto const major_comm_size = major_comm.get_size();
auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
auto const minor_comm_rank = minor_comm.get_rank();
auto const minor_comm_size = minor_comm.get_size();
std::cout << "major_comm_rank=" << major_comm_rank << " major_comm_size=" << major_comm_size
<< " minor_comm_rank=" << minor_comm_rank << " minor_comm_size=" << minor_comm_size
<< std::endl;
}

static_assert(std::is_floating_point_v<bias_t>);

if constexpr (!multi_gpu) {
Expand Down Expand Up @@ -104,6 +116,8 @@ neighbor_sample_impl(raft::handle_t const& handle,
// to store hops

// Get the number of hops. For homogeneous neighbor sampling, num_edge_types = 1.
RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time0 = std::chrono::steady_clock::now();

auto num_hops = raft::div_rounding_up_safe(
fan_out.size(), static_cast<size_t>(num_edge_types ? *num_edge_types : edge_type_t{1}));
Expand Down Expand Up @@ -152,6 +166,8 @@ neighbor_sample_impl(raft::handle_t const& handle,

std::vector<size_t> level_sizes{};

RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time1 = std::chrono::steady_clock::now();
for (size_t hop = 0; hop < num_hops; ++hop) {
std::optional<std::vector<size_t>> level_Ks{std::nullopt};
std::optional<std::vector<uint8_t>> gather_flags{std::nullopt};
Expand Down Expand Up @@ -306,6 +322,8 @@ neighbor_sample_impl(raft::handle_t const& handle,
dedupe_sources,
do_expensive_check);
}
RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time2 = std::chrono::steady_clock::now();

auto result_size = std::reduce(level_sizes.begin(), level_sizes.end());
size_t output_offset{};
Expand Down Expand Up @@ -416,6 +434,13 @@ neighbor_sample_impl(raft::handle_t const& handle,
output_offset += level_sizes[i];
}
}
RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time3 = std::chrono::steady_clock::now();
std::chrono::duration<double> dur0 = time1 - time0;
std::chrono::duration<double> dur1 = time2 - time1;
std::chrono::duration<double> dur2 = time3 - time2;
std::cout << "\tdetail::neighbor_sample_impl (less shuffle_and_organize_output) took ("
<< dur0.count() << "," << dur1.count() << "," << dur2.count() << ")." << std::endl;

return detail::shuffle_and_organize_output(handle,
std::move(result_srcs),
Expand Down Expand Up @@ -587,6 +612,8 @@ homogeneous_uniform_neighbor_sample(
{
using bias_t = weight_t; // dummy

RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time0 = std::chrono::steady_clock::now();
auto [majors, minors, weights, edge_ids, edge_types, hops, labels, offsets] =
detail::neighbor_sample_impl<vertex_t, edge_t, weight_t, edge_type_t, bias_t>(
handle,
Expand All @@ -607,6 +634,12 @@ homogeneous_uniform_neighbor_sample(
sampling_flags.prior_sources_behavior,
sampling_flags.dedupe_sources,
do_expensive_check);
RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time1 = std::chrono::steady_clock::now();
std::chrono::duration<double> dur0 = time1 - time0;
std::cout << "homogeneous_uniform_neighbor_sample (starting_vertices.size()="
<< starting_vertices.size() << " # edges sampled=" << majors.size() << ") took "
<< dur0.count() << std::endl;

return std::make_tuple(std::move(majors),
std::move(minors),
Expand Down Expand Up @@ -647,6 +680,8 @@ heterogeneous_uniform_neighbor_sample(
{
using bias_t = weight_t; // dummy

RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time0 = std::chrono::steady_clock::now();
auto [majors, minors, weights, edge_ids, edge_types, hops, labels, offsets] =
detail::neighbor_sample_impl<vertex_t, edge_t, weight_t, edge_type_t, bias_t>(
handle,
Expand All @@ -667,6 +702,12 @@ heterogeneous_uniform_neighbor_sample(
sampling_flags.prior_sources_behavior,
sampling_flags.dedupe_sources,
do_expensive_check);
RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time1 = std::chrono::steady_clock::now();
std::chrono::duration<double> dur0 = time1 - time0;
std::cout << "heterogeneous_uniform_neighbor_sample (starting_vertices.size()="
<< starting_vertices.size() << " # edges sampled=" << majors.size() << ") took "
<< dur0.count() << std::endl;

return std::make_tuple(std::move(majors),
std::move(minors),
Expand Down Expand Up @@ -706,6 +747,8 @@ homogeneous_biased_neighbor_sample(
sampling_flags_t sampling_flags,
bool do_expensive_check)
{
RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time0 = std::chrono::steady_clock::now();
auto [majors, minors, weights, edge_ids, edge_types, hops, labels, offsets] =
detail::neighbor_sample_impl<vertex_t, edge_t, weight_t, edge_type_t, bias_t>(
handle,
Expand All @@ -725,6 +768,12 @@ homogeneous_biased_neighbor_sample(
sampling_flags.prior_sources_behavior,
sampling_flags.dedupe_sources,
do_expensive_check);
RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time1 = std::chrono::steady_clock::now();
std::chrono::duration<double> dur0 = time1 - time0;
std::cout << "homogeneous_biased_neighbor_sample (starting_vertices.size()="
<< starting_vertices.size() << " # edges sampled=" << majors.size() << ") took "
<< dur0.count() << std::endl;

return std::make_tuple(std::move(majors),
std::move(minors),
Expand Down Expand Up @@ -765,6 +814,8 @@ heterogeneous_biased_neighbor_sample(
sampling_flags_t sampling_flags,
bool do_expensive_check)
{
RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time0 = std::chrono::steady_clock::now();
auto [majors, minors, weights, edge_ids, edge_types, hops, labels, offsets] =
detail::neighbor_sample_impl<vertex_t, edge_t, weight_t, edge_type_t, bias_t>(
handle,
Expand All @@ -784,6 +835,12 @@ heterogeneous_biased_neighbor_sample(
sampling_flags.prior_sources_behavior,
sampling_flags.dedupe_sources,
do_expensive_check);
RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time1 = std::chrono::steady_clock::now();
std::chrono::duration<double> dur0 = time1 - time0;
std::cout << "heterogeneous_biased_neighbor_sample (starting_vertices.size()="
<< starting_vertices.size() << " # edges sampled=" << majors.size() << ") took "
<< dur0.count() << std::endl;

return std::make_tuple(std::move(majors),
std::move(minors),
Expand Down
24 changes: 24 additions & 0 deletions cpp/src/sampling/sampling_post_processing_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -2593,6 +2593,8 @@ renumber_and_compress_sampled_edgelist(

// 1. check input arguments

RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time0 = std::chrono::steady_clock::now();
check_input_edges<label_index_t, vertex_t, vertex_type_t>(handle,
edgelist_majors,
edgelist_minors,
Expand Down Expand Up @@ -3183,6 +3185,10 @@ renumber_and_compress_sampled_edgelist(
}

edgelist_hops = std::nullopt;
RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time1 = std::chrono::steady_clock::now();
std::chrono::duration<double> dur0 = time1 - time0;
std::cout << "renumbber_and_compress_sampled_edgelist took " << dur0.count() << std::endl;

return std::make_tuple(
doubly_compress ? std::make_optional(std::move(compressed_nzd_vertices)) : std::nullopt,
Expand Down Expand Up @@ -3232,6 +3238,8 @@ renumber_and_sort_sampled_edgelist(

// 1. check input arguments

RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time0 = std::chrono::steady_clock::now();
check_input_edges<label_index_t, vertex_t, vertex_type_t>(handle,
edgelist_majors,
edgelist_minors,
Expand Down Expand Up @@ -3349,6 +3357,10 @@ renumber_and_sort_sampled_edgelist(
}

edgelist_hops = std::nullopt;
RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time1 = std::chrono::steady_clock::now();
std::chrono::duration<double> dur0 = time1 - time0;
std::cout << "renumbber_and_sort_sampled_edgelist took " << dur0.count() << std::endl;

return std::make_tuple(std::move(src_is_major ? edgelist_majors : edgelist_minors),
std::move(src_is_major ? edgelist_minors : edgelist_majors),
Expand Down Expand Up @@ -3402,6 +3414,8 @@ heterogeneous_renumber_and_sort_sampled_edgelist(

// 1. check input arguments

RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time0 = std::chrono::steady_clock::now();
check_input_edges<label_index_t, vertex_t, vertex_type_t>(handle,
edgelist_majors,
edgelist_minors,
Expand Down Expand Up @@ -3560,6 +3574,10 @@ heterogeneous_renumber_and_sort_sampled_edgelist(

edgelist_edge_types = std::nullopt;
edgelist_hops = std::nullopt;
RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time1 = std::chrono::steady_clock::now();
std::chrono::duration<double> dur0 = time1 - time0;
std::cout << "heterogeneous_renumbber_and_sort_sampled_edgelist took " << dur0.count() << std::endl;

return std::make_tuple(std::move(src_is_major ? edgelist_majors : edgelist_minors),
std::move(src_is_major ? edgelist_minors : edgelist_majors),
Expand Down Expand Up @@ -3595,6 +3613,8 @@ sort_sampled_edgelist(raft::handle_t const& handle,
bool src_is_major,
bool do_expensive_check)
{
RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time0 = std::chrono::steady_clock::now();
using label_index_t = uint32_t;
using vertex_type_t = uint32_t; // dummy

Expand Down Expand Up @@ -3692,6 +3712,10 @@ sort_sampled_edgelist(raft::handle_t const& handle,
}

edgelist_hops = std::nullopt;
RAFT_CUDA_TRY(cudaDeviceSynchronize());
auto time1 = std::chrono::steady_clock::now();
std::chrono::duration<double> dur0 = time1 - time0;
std::cout << "sort_sampled_edgelist took " << dur0.count() << std::endl;

return std::make_tuple(std::move(src_is_major ? edgelist_majors : edgelist_minors),
std::move(src_is_major ? edgelist_minors : edgelist_majors),
Expand Down