Commit 02339e6

Revert "[PGNCCL] Make sure we do not use split for P2P comm creation (pytorch#139013)"
This reverts commit 74878ac. Reverted pytorch#139013 on behalf of https://github.com/ZainRizvi due to: Sorry, but this appears to be breaking on trunk. See: distributed/_composable/test_composability/test_pp_composability.py::ComposabilityTest::test_manual_with_data_parallel_dp_type_DDP_ScheduleClass0_use_new_runtime_False [GH job link](https://github.com/pytorch/pytorch/actions/runs/11559910615/job/32177150816) [HUD commit link](https://hud.pytorch.org/pytorch/pytorch/commit/74878ac271feecfa3ff3d32f78c7d889bcac97d6) ([comment](pytorch#139013 (comment)))
1 parent 1a275fe commit 02339e6

2 files changed: +1 -28 lines

test/distributed/test_c10d_nccl.py (-22 lines)

@@ -982,28 +982,6 @@ def test_non_blocking_p2p(self):
             self.assertEqual(send_tensor, recv_tensor)
         dist.destroy_process_group()
 
-    @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
-    @parametrize("eager_init", [True, False])
-    def test_subgroup_p2p(self, eager_init: bool):
-        store = c10d.FileStore(self.file_name, self.world_size)
-        device = torch.device(f"cuda:{self.rank % torch.cuda.device_count()}")
-        c10d.init_process_group(
-            "nccl",
-            world_size=self.world_size,
-            rank=self.rank,
-            store=store,
-            device_id=device if eager_init else None,
-        )
-        send_tensor = torch.ones(10, 10, device=device)
-        group = dist.new_group()
-        if self.rank == 0:
-            dist.send(send_tensor, 1, group=group)
-        if self.rank == 1:
-            recv_tensor = torch.rand(10, 10, device=device)
-            dist.recv(recv_tensor, 0, group=group)
-            self.assertEqual(send_tensor, recv_tensor)
-        dist.destroy_process_group()
-
     @requires_nccl()
     @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
     def test_get_uid(self):
torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp (+1 -6 lines)

@@ -2401,12 +2401,7 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::getNCCLComm(
 #endif
 
 #ifdef NCCL_HAS_COMM_SPLIT
-  // Use split to create a new communicator only if:
-  // 1. The parent comm is known; AND
-  // 2. The new comm is not for a point-to-point operation.
-  // ncclCommSplit() is a collective call, so it does not work for P2P
-  // operations.
-  if (options_->split_from && !singleP2POp) {
+  if (options_->split_from) {
     // Find a valid, healthy communicator to split from if possible.
     std::lock_guard<std::mutex> lock(options_->split_from->mutex_);
     auto& other_comms = options_->split_from->devNCCLCommMap_;

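For context, the change being reverted both added the test above and taught getNCCLComm() to skip ncclCommSplit() when the new communicator is for a single point-to-point operation, since ncclCommSplit() is a collective call. Below is a minimal standalone sketch of the subgroup-P2P pattern that test exercised; it is illustrative only (not part of this commit) and assumes two CUDA GPUs, a hypothetical script name repro_subgroup_p2p.py, and a launch via torchrun --nproc_per_node=2.

```python
# Minimal sketch of the subgroup P2P pattern from the removed test.
# Assumes 2 GPUs; run with: torchrun --nproc_per_node=2 repro_subgroup_p2p.py
import os

import torch
import torch.distributed as dist


def main() -> None:
    rank = int(os.environ["RANK"])
    device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")
    # Passing device_id requests eager communicator initialization, one of the
    # two cases the removed test parametrized over (eager_init=True/False).
    dist.init_process_group("nccl", device_id=device)
    group = dist.new_group()  # subgroup spanning all ranks
    send_tensor = torch.ones(10, 10, device=device)
    if rank == 0:
        dist.send(send_tensor, 1, group=group)
    elif rank == 1:
        recv_tensor = torch.rand(10, 10, device=device)
        dist.recv(recv_tensor, 0, group=group)
        assert torch.equal(send_tensor, recv_tensor)
    dist.destroy_process_group()


if __name__ == "__main__":
    main()
```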