intel · Chao1Han · Mar 7, 2025 · Mar 7, 2025 · Mar 7, 2025 · Mar 13, 2025
diff --git a/src/xccl/ProcessGroupXCCL.cpp b/src/xccl/ProcessGroupXCCL.cpp
@@ -82,9 +82,8 @@ void checkSingleTensor(
     const at::Tensor& tensor,
     const bool p2p = false // whether operation is a P2P operation
 ) {
-  if (!tensor.is_xpu() || tensor.is_sparse() || tensor.is_complex()) {
-    C10_THROW_ERROR(
-        ValueError, "Tensors must be XPU and dense and non-complex");
+  if (!tensor.is_xpu() || tensor.is_sparse()) {
+    C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense");
 
     // Skip the following requirements for P2P operations
     if (!tensor.is_contiguous(tensor.suggest_memory_format())) {
@@ -108,9 +107,8 @@ int64_t checkTensorOnSameDevice(const std::vector<at::Tensor>& tensors) {
 
   int64_t total_numel = 0;
   for (const auto& t : tensors) {
-    if (!t.is_xpu() || t.is_sparse() || t.is_complex()) {
-      C10_THROW_ERROR(
-          ValueError, "Tensors must be XPU and dense and non-complex");
+    if (!t.is_xpu() || t.is_sparse()) {
+      C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense");
     }
     if (t.scalar_type() != first.scalar_type()) {
       C10_THROW_ERROR(TypeError, "Tensors must have identical type");
@@ -160,6 +158,20 @@ ccl::reduction getXcclReduceOp(const ReduceOp& reduceOp, at::Tensor& input) {
   }
 }
 
+// Whether `reduceOp` is accepted for complex tensors, which callers reduce
+// through their at::view_as_real view. Only SUM, AVG and UNUSED qualify.
+bool complexViewAsRealAllowed(const ReduceOp& reduceOp) {
+  switch (reduceOp) {
+    case ReduceOp::SUM:
+    case ReduceOp::AVG:
+    case ReduceOp::UNUSED:
+      return true;
+    default:
+      // Every other reduction is rejected for complex inputs.
+      return false;
+  }
+}
+
 void syncStream(
     at::Device& device,
     at::xpu::XPUEvent& xcclEvent,
@@ -939,6 +951,14 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::allreduce(
     const AllreduceOptions& opts) {
   TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
   auto tensor = tensors.back();
+  if (tensor.is_complex()) { // complex input: reduce through its real view
+    TORCH_CHECK(
+        complexViewAsRealAllowed(opts.reduceOp),
+        "all_reduce does not support ",
+        opts.reduceOp,
+        " on complex tensors");
+    tensor = at::view_as_real(tensor);
+  }
   checkSingleTensor(tensor);
 
   // @lint-ignore CLANGTIDY
@@ -1056,6 +1076,9 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::broadcast(
     const BroadcastOptions& opts) {
   TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
   auto tensor = tensors.back();
+  if (tensor.is_complex()) {
+    tensor = at::view_as_real(tensor);
+  }
   checkSingleTensor(tensor);
 
   // @lint-ignore CLANGTIDY
@@ -1141,6 +1164,14 @@ c10::intrusive_ptr<Work> ProcessGroupXCCL::reduce(
     const ReduceOptions& opts) {
   TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG);
   auto tensor = tensors.back();
+  if (tensor.is_complex()) { // complex input: reduce through its real view
+    TORCH_CHECK(
+        complexViewAsRealAllowed(opts.reduceOp),
+        "reduce does not support ",
+        opts.reduceOp,
+        " on complex tensors");
+    tensor = at::view_as_real(tensor);
+  }
   checkSingleTensor(tensor);
 
   RECORD_PARAM_COMMS_DATA(

diff --git a/test/xpu/distributed/test_c10d_ops_xccl.py b/test/xpu/distributed/test_c10d_ops_xccl.py
@@ -26,6 +26,8 @@
 from test_c10d_xccl import init_multigpu_helper, requires_xccl
 from torch.testing._internal.common_distributed import MultiProcContinousTest
 from torch.testing._internal.common_utils import (
+    instantiate_parametrized_tests,
+    parametrize,
     skip_but_pass_in_sandcastle_if,
     TEST_WITH_DEV_DBG_ASAN,
     TEST_XPU,
@@ -92,8 +94,9 @@ def test_empty_tensors(self):
         self.assertEqual(0, ys[0].numel())
 
     @requires_xccl()
+    @parametrize("dtype", [torch.float32, torch.cfloat])
     @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs")
-    def test_broadcast_ops(self):
+    def test_broadcast_ops(self, dtype: torch.dtype):
         pg = self.pg
 
         def broadcast(xs, rootRank, rootTensor):
@@ -107,29 +110,34 @@ def broadcast(xs, rootRank, rootTensor):
         # Every rank is root once
         for i in range(self.world_size):
             # Run with 1 input tensor
-            x = torch.tensor([self.rank]).xpu(self.rank_to_GPU[self.rank][0])
+            x = torch.tensor([self.rank], dtype=dtype).xpu(
+                self.rank_to_GPU[self.rank][0]
+            )
             output = broadcast([x], i, 0)
-            self.assertEqual(torch.tensor([i]), output[0])
+            self.assertEqual(torch.tensor([i]).to(dtype), output[0])
 
-            expected_tensor = torch.empty([i + 1, i + 1]).fill_(i + 1)
+            expected_tensor = torch.empty([i + 1, i + 1]).fill_(i + 1).to(dtype)
             xs = [
-                torch.empty([i + 1, i + 1]).fill_(-1).xpu(device=device_idx)
+                torch.empty([i + 1, i + 1]).fill_(-1).xpu(device=device_idx).to(dtype)
                 for device_idx in self.rank_to_GPU[self.rank]
             ]
 
             # test with multiple input tensors (multiple gpu in one rank)
             for j in range(len(xs)):
                 if self.rank == i:
-                    xs[j] = expected_tensor.xpu(device=self.rank_to_GPU[self.rank][j])
+                    xs[j] = expected_tensor.xpu(
+                        device=self.rank_to_GPU[self.rank][j]
+                    ).to(dtype)
 
                 broadcast(xs, i, j)
 
                 for tensor in xs:
                     self.assertEqual(tensor, expected_tensor)
 
     @requires_xccl()
+    @parametrize("dtype", [torch.float32, torch.cfloat])
     @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "XCCL test requires 2+ GPUs")
-    def test_allreduce_ops(self):
+    def test_allreduce_ops(self, dtype: torch.dtype):
         device_count = torch.xpu.device_count()
         pg = self.pg
         local_device_id = self.rank_to_GPU[self.rank][0]
@@ -141,23 +149,23 @@ def allreduce(tensors, op):
             work.wait()
 
         # Sum
-        tensors = [torch.tensor([self.rank + 1]).xpu(local_device_id)]
+        tensors = [torch.tensor([self.rank + 1]).xpu(local_device_id).to(dtype)]
 
         allreduce(tensors, c10d.ReduceOp.SUM)
 
         ndev = self.world_size
         self.assertEqual(
-            torch.tensor([ndev * (ndev + 1) // 2]),
+            torch.tensor([ndev * (ndev + 1) // 2]).to(dtype),
             tensors[0],
         )
 
         # Avg
-        tensors = [torch.tensor([self.rank + 1.0]).xpu(local_device_id)]
+        tensors = [torch.tensor([self.rank + 1.0]).xpu(local_device_id).to(dtype)]
 
         allreduce(tensors, c10d.ReduceOp.AVG)
         ndev = self.world_size
         self.assertEqual(
-            torch.tensor([ndev * (ndev + 1.0) / (2.0 * ndev)]),
+            torch.tensor([ndev * (ndev + 1.0) / (2.0 * ndev)]).to(dtype),
             tensors[0],
         )
 
@@ -808,6 +816,7 @@ def test_send_recv_object_list(self):
             self.assertEqual(object_list[0], 99)
 
 
+instantiate_parametrized_tests(ProcessGroupXCCLOpTest)
 if __name__ == "__main__":
     rank = int(os.getenv("RANK", -1))
     world_size = int(os.getenv("WORLD_SIZE", 2))