rapidsai · seunghwak · Feb 27, 2025
@@ -207,6 +207,8 @@ shuffle_and_organize_output(
   std::optional<rmm::device_uvector<label_t>>&& labels,
   std::optional<raft::device_span<int32_t const>> label_to_output_comm_rank)
 {
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto time0 = std::chrono::steady_clock::now();
   std::optional<rmm::device_uvector<size_t>> offsets{std::nullopt};
 
   if (labels) {
@@ -741,6 +743,10 @@ shuffle_and_organize_output(
       handle.get_thrust_policy(), offsets->begin(), offsets->end(), offsets->begin());
     labels = std::move(unique_labels);
   }
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto time1                         = std::chrono::steady_clock::now();
+  std::chrono::duration<double> dur0 = time1 - time0;
+  std::cout << "\tdetail::shuffle_and_organize took " << dur0.count() << std::endl;
 
   return std::make_tuple(std::move(majors),
                          std::move(minors),

@@ -72,6 +72,18 @@ neighbor_sample_impl(raft::handle_t const& handle,
                      bool dedupe_sources,
                      bool do_expensive_check)
 {
+  if constexpr (multi_gpu) {
+    auto& major_comm           = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
+    auto const major_comm_rank = major_comm.get_rank();
+    auto const major_comm_size = major_comm.get_size();
+    auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
+    auto const minor_comm_rank = minor_comm.get_rank();
+    auto const minor_comm_size = minor_comm.get_size();
+    std::cout << "major_comm_rank=" << major_comm_rank << " major_comm_size=" << major_comm_size
+              << " minor_comm_rank=" << minor_comm_rank << " minor_comm_size=" << minor_comm_size
+              << std::endl;
+  }
+
   static_assert(std::is_floating_point_v<bias_t>);
 
   if constexpr (!multi_gpu) {
@@ -104,6 +116,8 @@ neighbor_sample_impl(raft::handle_t const& handle,
                                                                               // to store hops
 
   // Get the number of hops. For homogeneous neighbor sampling, num_edge_types defaults to 1.
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto time0 = std::chrono::steady_clock::now();
 
   auto num_hops = raft::div_rounding_up_safe(
     fan_out.size(), static_cast<size_t>(num_edge_types ? *num_edge_types : edge_type_t{1}));
@@ -152,6 +166,8 @@ neighbor_sample_impl(raft::handle_t const& handle,
 
   std::vector<size_t> level_sizes{};
 
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto time1 = std::chrono::steady_clock::now();
   for (size_t hop = 0; hop < num_hops; ++hop) {
     std::optional<std::vector<size_t>> level_Ks{std::nullopt};
     std::optional<std::vector<uint8_t>> gather_flags{std::nullopt};
@@ -306,6 +322,8 @@ neighbor_sample_impl(raft::handle_t const& handle,
         dedupe_sources,
         do_expensive_check);
   }
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto time2 = std::chrono::steady_clock::now();
 
   auto result_size = std::reduce(level_sizes.begin(), level_sizes.end());
   size_t output_offset{};
@@ -416,6 +434,13 @@ neighbor_sample_impl(raft::handle_t const& handle,
       output_offset += level_sizes[i];
     }
   }
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto time3                         = std::chrono::steady_clock::now();
+  std::chrono::duration<double> dur0 = time1 - time0;
+  std::chrono::duration<double> dur1 = time2 - time1;
+  std::chrono::duration<double> dur2 = time3 - time2;
+  std::cout << "\tdetail::neighbor_sample_impl (less shuffle_and_organize_output) took ("
+            << dur0.count() << "," << dur1.count() << "," << dur2.count() << ")." << std::endl;
 
   return detail::shuffle_and_organize_output(handle,
                                              std::move(result_srcs),
@@ -587,6 +612,8 @@ homogeneous_uniform_neighbor_sample(
 {
   using bias_t = weight_t;  // dummy
 
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto time0 = std::chrono::steady_clock::now();
   auto [majors, minors, weights, edge_ids, edge_types, hops, labels, offsets] =
     detail::neighbor_sample_impl<vertex_t, edge_t, weight_t, edge_type_t, bias_t>(
       handle,
@@ -607,6 +634,12 @@ homogeneous_uniform_neighbor_sample(
       sampling_flags.prior_sources_behavior,
       sampling_flags.dedupe_sources,
       do_expensive_check);
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto time1                         = std::chrono::steady_clock::now();
+  std::chrono::duration<double> dur0 = time1 - time0;
+  std::cout << "homogeneous_uniform_neighbor_sample (starting_vertices.size()="
+            << starting_vertices.size() << " # edges sampled=" << majors.size() << ") took "
+            << dur0.count() << std::endl;
 
   return std::make_tuple(std::move(majors),
                          std::move(minors),
@@ -647,6 +680,8 @@ heterogeneous_uniform_neighbor_sample(
 {
   using bias_t = weight_t;  // dummy
 
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto time0 = std::chrono::steady_clock::now();
   auto [majors, minors, weights, edge_ids, edge_types, hops, labels, offsets] =
     detail::neighbor_sample_impl<vertex_t, edge_t, weight_t, edge_type_t, bias_t>(
       handle,
@@ -667,6 +702,12 @@ heterogeneous_uniform_neighbor_sample(
       sampling_flags.prior_sources_behavior,
       sampling_flags.dedupe_sources,
       do_expensive_check);
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto time1                         = std::chrono::steady_clock::now();
+  std::chrono::duration<double> dur0 = time1 - time0;
+  std::cout << "heterogeneous_uniform_neighbor_sample (starting_vertices.size()="
+            << starting_vertices.size() << " # edges sampled=" << majors.size() << ") took "
+            << dur0.count() << std::endl;
 
   return std::make_tuple(std::move(majors),
                          std::move(minors),
@@ -706,6 +747,8 @@ homogeneous_biased_neighbor_sample(
   sampling_flags_t sampling_flags,
   bool do_expensive_check)
 {
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto time0 = std::chrono::steady_clock::now();
   auto [majors, minors, weights, edge_ids, edge_types, hops, labels, offsets] =
     detail::neighbor_sample_impl<vertex_t, edge_t, weight_t, edge_type_t, bias_t>(
       handle,
@@ -725,6 +768,12 @@ homogeneous_biased_neighbor_sample(
       sampling_flags.prior_sources_behavior,
       sampling_flags.dedupe_sources,
       do_expensive_check);
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto time1                         = std::chrono::steady_clock::now();
+  std::chrono::duration<double> dur0 = time1 - time0;
+  std::cout << "homogeneous_biased_neighbor_sample (starting_vertices.size()="
+            << starting_vertices.size() << " # edges sampled=" << majors.size() << ") took "
+            << dur0.count() << std::endl;
 
   return std::make_tuple(std::move(majors),
                          std::move(minors),
@@ -765,6 +814,8 @@ heterogeneous_biased_neighbor_sample(
   sampling_flags_t sampling_flags,
   bool do_expensive_check)
 {
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto time0 = std::chrono::steady_clock::now();
   auto [majors, minors, weights, edge_ids, edge_types, hops, labels, offsets] =
     detail::neighbor_sample_impl<vertex_t, edge_t, weight_t, edge_type_t, bias_t>(
       handle,
@@ -784,6 +835,12 @@ heterogeneous_biased_neighbor_sample(
       sampling_flags.prior_sources_behavior,
       sampling_flags.dedupe_sources,
       do_expensive_check);
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto time1                         = std::chrono::steady_clock::now();
+  std::chrono::duration<double> dur0 = time1 - time0;
+  std::cout << "heterogeneous_biased_neighbor_sample (starting_vertices.size()="
+            << starting_vertices.size() << " # edges sampled=" << majors.size() << ") took "
+            << dur0.count() << std::endl;
 
   return std::make_tuple(std::move(majors),
                          std::move(minors),

@@ -2593,6 +2593,8 @@ renumber_and_compress_sampled_edgelist(
 
   // 1. check input arguments
 
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto time0 = std::chrono::steady_clock::now();
   check_input_edges<label_index_t, vertex_t, vertex_type_t>(handle,
                                                             edgelist_majors,
                                                             edgelist_minors,
@@ -3183,6 +3185,10 @@ renumber_and_compress_sampled_edgelist(
   }
 
   edgelist_hops = std::nullopt;
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto time1                         = std::chrono::steady_clock::now();
+  std::chrono::duration<double> dur0 = time1 - time0;
+  std::cout << "renumbber_and_compress_sampled_edgelist took " << dur0.count() << std::endl;
 
   return std::make_tuple(
     doubly_compress ? std::make_optional(std::move(compressed_nzd_vertices)) : std::nullopt,
@@ -3232,6 +3238,8 @@ renumber_and_sort_sampled_edgelist(
 
   // 1. check input arguments
 
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto time0 = std::chrono::steady_clock::now();
   check_input_edges<label_index_t, vertex_t, vertex_type_t>(handle,
                                                             edgelist_majors,
                                                             edgelist_minors,
@@ -3349,6 +3357,10 @@ renumber_and_sort_sampled_edgelist(
   }
 
   edgelist_hops = std::nullopt;
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto time1                         = std::chrono::steady_clock::now();
+  std::chrono::duration<double> dur0 = time1 - time0;
+  std::cout << "renumbber_and_sort_sampled_edgelist took " << dur0.count() << std::endl;
 
   return std::make_tuple(std::move(src_is_major ? edgelist_majors : edgelist_minors),
                          std::move(src_is_major ? edgelist_minors : edgelist_majors),
@@ -3402,6 +3414,8 @@ heterogeneous_renumber_and_sort_sampled_edgelist(
 
   // 1. check input arguments
 
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto time0 = std::chrono::steady_clock::now();
   check_input_edges<label_index_t, vertex_t, vertex_type_t>(handle,
                                                             edgelist_majors,
                                                             edgelist_minors,
@@ -3560,6 +3574,10 @@ heterogeneous_renumber_and_sort_sampled_edgelist(
 
   edgelist_edge_types = std::nullopt;
   edgelist_hops       = std::nullopt;
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto time1                         = std::chrono::steady_clock::now();
+  std::chrono::duration<double> dur0 = time1 - time0;
+  std::cout << "heterogeneous_renumbber_and_sort_sampled_edgelist took " << dur0.count() << std::endl;
 
   return std::make_tuple(std::move(src_is_major ? edgelist_majors : edgelist_minors),
                          std::move(src_is_major ? edgelist_minors : edgelist_majors),
@@ -3595,6 +3613,8 @@ sort_sampled_edgelist(raft::handle_t const& handle,
                       bool src_is_major,
                       bool do_expensive_check)
 {
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto time0 = std::chrono::steady_clock::now();
   using label_index_t = uint32_t;
   using vertex_type_t = uint32_t;  // dummy
 
@@ -3692,6 +3712,10 @@ sort_sampled_edgelist(raft::handle_t const& handle,
   }
 
   edgelist_hops = std::nullopt;
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  auto time1                         = std::chrono::steady_clock::now();
+  std::chrono::duration<double> dur0 = time1 - time0;
+  std::cout << "sort_sampled_edgelist took " << dur0.count() << std::endl;
 
   return std::make_tuple(std::move(src_is_major ? edgelist_majors : edgelist_minors),
                          std::move(src_is_major ? edgelist_minors : edgelist_majors),