Commit 4a37f57

pearu authored and pytorchmergebot committed
Add batched sparse CSR/CSC/BSR/BSC to sparse COO conversion support (pytorch#116206)
As in the title.

Fixes pytorch#104868

Pull Request resolved: pytorch#116206
Approved by: https://github.com/amjames, https://github.com/lezcano, https://github.com/cpuhrsch
1 parent 4b74bb6 commit 4a37f57
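For context, a minimal usage sketch of the conversion this commit enables. The shapes and values below are illustrative only and are not taken from the PR; `to_sparse_csr`/`to_sparse` are the existing public APIs exercised by the changed code paths.

import torch

# Two batches of 3x4 matrices with the same number of specified elements per
# batch, as required for a batched CSR tensor.
dense = torch.zeros(2, 3, 4)
dense[0, 0, 1] = 1.0
dense[0, 2, 3] = 2.0
dense[1, 0, 1] = 3.0
dense[1, 2, 3] = 4.0
batched_csr = dense.to_sparse_csr()

# Before this commit, converting a batched compressed tensor to COO raised
# "crow_indices is supposed to be a vector ..." (see the removed test below);
# now it returns a COO tensor whose indices carry the batch coordinates as
# leading sparse dimensions.
coo = batched_csr.to_sparse()
print(coo.indices())  # shape (1 batch dim + 2, total nnz): batch, row, col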

File tree

5 files changed: +72 -77 lines changed


aten/src/ATen/native/TensorConversions.cpp

+37-20
@@ -1513,24 +1513,38 @@ void convert_indices_from_csr_to_coo_cpu(
     const Tensor& crow_indices,
     const Tensor& col_indices,
     const bool transpose = false) {
-  int64_t nrows = crow_indices.numel() - 1;
-  if (nrows == 0) {
-    indices.zero_();
+  int64_t nrows = crow_indices.size(-1) - 1;
+  int64_t nnz = col_indices.size(-1);
+  if (nrows == 0 || nnz == 0) {
+    indices.zero_(); // is this needed as indices has a zero-valued
+                     // dimension when nrows or nnz is 0?
     return;
   }
   auto crow_indices_ = crow_indices.expect_contiguous();
+  int64_t total_nnz = col_indices.numel();
+  int64_t batch_ndim = crow_indices.dim() - 1;
+  if (batch_ndim > 0) {
+    auto batch_indices = indices.narrow(0, 0, batch_ndim);
+    batch_indices.copy_(batch_indices.new_ones(crow_indices.sizes().slice(0, batch_ndim))
+                            .nonzero()
+                            .transpose(0, 1)
+                            .repeat_interleave(nnz, 1));
+  }
   const input_t* crow_indices_data_in = crow_indices_->data_ptr<input_t>();
   TORCH_INTERNAL_ASSERT(indices.is_contiguous());
-  auto row0 = indices.select(0, transpose ? 1 : 0);
-  auto row1 = indices.select(0, transpose ? 0 : 1);
+  auto row0 = indices.select(0, transpose ? batch_ndim + 1 : batch_ndim + 0);
+  auto row1 = indices.select(0, transpose ? batch_ndim + 0 : batch_ndim + 1);
   output_t* data_out = row0.data_ptr<output_t>();
-  row1.copy_(*col_indices.expect_contiguous());
+  auto col_indices_ = col_indices.expect_contiguous();
+  row1.copy_(col_indices_->view({-1}));
   at::parallel_for(
-      0, nrows, at::internal::GRAIN_SIZE, [&](int64_t start, int64_t end) {
-        for (const auto i : c10::irange(start, end)) {
+      0, nrows * total_nnz / nnz, at::internal::GRAIN_SIZE, [&](int64_t start, int64_t end) {
+        for (const auto i_ : c10::irange(start, end)) {
+          auto b = i_ / nrows;
+          auto i = i_ % nrows;
           std::fill(
-              &data_out[crow_indices_data_in[i]],
-              &data_out[crow_indices_data_in[i + 1]],
+              &data_out[b * nnz + crow_indices_data_in[b * (nrows + 1) + i]],
+              &data_out[b * nnz + crow_indices_data_in[b * (nrows + 1) + i + 1]],
               static_cast<output_t>(i));
         }
       });
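The new batch handling in the hunk above fills the leading rows of `indices` with every batch coordinate, repeated once per specified element of its batch. A rough Python equivalent of that tensor trick, not part of the commit and with a hypothetical helper name, is:

import torch

def batch_coo_indices(batch_shape, nnz):
    # Enumerate all batch coordinates as a (batch_ndim, num_batches) tensor via
    # ones(...).nonzero().T, then repeat each coordinate column nnz times so it
    # lines up with the per-batch nnz layout of the flattened col_indices.
    coords = torch.ones(batch_shape).nonzero().transpose(0, 1)
    return coords.repeat_interleave(nnz, dim=1)

print(batch_coo_indices((2, 3), 2))  # shape (2, 12): 6 batch coords, each repeated twice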
@@ -1829,27 +1843,30 @@ Tensor sparse_compressed_to_sparse(const Tensor& self, const int64_t sparse_dim)
   Tensor values;
   Tensor indices = at::_convert_indices_from_csr_to_coo(compressed_indices, plain_indices,
                                                         false, (layout == kSparseCsc || layout == kSparseBsc));
+  const auto batch_ndim = compressed_indices.dim() - 1;
   // Only CSR is trivially coalesced
   bool coalesced = layout == kSparseCsr || self.numel() == 0 || self._nnz() == 1;
   AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(layout, "sparse_compressed_to_sparse",
-      [&] { values = self.values(); },
+      [&] { values = self.values().flatten(0, batch_ndim); },
       [&] {
-        auto size = DimVector(self.sizes().slice(0, 2));
-        auto blocksize = DimVector(self.values().sizes().slice(1, 2));
-
+        auto blocksize = DimVector(self.values().sizes().slice(batch_ndim + 1, 2));
+        DimVector batch_blocksize;
+        batch_blocksize.append(batch_ndim, 1);
+        batch_blocksize.append(blocksize);
         const auto max_blocksize = std::max(blocksize[0], blocksize[1]);
         const auto max_blocksize_arange = at::arange(max_blocksize, indices.options());
         const auto blocksize_arange_0 = max_blocksize_arange.narrow(-1, 0, blocksize[0]);
         const auto blocksize_arange_1 = max_blocksize_arange.narrow(-1, 0, blocksize[1]);
-        const auto block_coo_indices = at::stack({
+        const auto block_coo_indices_ = at::stack({
             blocksize_arange_0.unsqueeze(-1).expand({-1, blocksize[1]}),
             blocksize_arange_1.unsqueeze(0).expand({blocksize[0], -1})
-        }).flatten(-2, -1);
-
+        }).flatten(-2, -1); // equivalent to torch.ones(blocksize).nonzero().T
+        const auto block_coo_indices = at::zeros({batch_ndim + 2, blocksize[0] * blocksize[1]}, indices.options());
+        block_coo_indices.narrow(0, batch_ndim, 2).copy_(block_coo_indices_);
         indices = indices
                       // Scale indices that identify blocks to element-wise coordinates that correspond
                       // to the top-left corner of each block.
-                      .mul(at::tensor(blocksize, indices.options()).unsqueeze_(-1))
+                      .mul(at::tensor(batch_blocksize, indices.options()).unsqueeze_(1))
                       // Now that we know top-left block coordinates, we offset them with element-wise
                       // coordinates in the block to get the result.
                       // NOTE: indices is mapped from (dim, nnz) to (dim, nnz, 1),
@@ -1861,10 +1878,10 @@ Tensor sparse_compressed_to_sparse(const Tensor& self, const int64_t sparse_dim)
                       // to produce valid nnz dimension of a COO tensor.
                       .flatten(-2, -1);

-        values = self.values().flatten(0, 2);
+        values = self.values().flatten(0, batch_ndim + 2);

         // BSRs not spanning across several rows produces coalesced results.
-        coalesced |= (layout == kSparseBsr && blocksize[0] == 1);
+        coalesced |= (layout == kSparseBsr && blocksize[0] == 1 && batch_ndim == 0);
       });
   return at::native::_sparse_coo_tensor_unsafe(indices, values, self.sizes())._coalesced_(coalesced);
 }
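In the blocked branch above, the within-block offsets built with at::stack are, as the new inline comment notes, just an enumeration of every element coordinate inside one block. A quick illustrative check in Python (the blocksize is chosen arbitrarily; this is not part of the commit):

import torch

blocksize = (2, 3)  # illustrative block shape
a0 = torch.arange(blocksize[0])
a1 = torch.arange(blocksize[1])
block_coo = torch.stack([
    a0.unsqueeze(-1).expand(-1, blocksize[1]),
    a1.unsqueeze(0).expand(blocksize[0], -1),
]).flatten(-2, -1)

# Same coordinates as enumerating the nonzeros of an all-ones block.
assert torch.equal(block_coo, torch.ones(blocksize).nonzero().T)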

aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp

+4-35
@@ -147,21 +147,18 @@ TORCH_META_FUNC(_convert_indices_from_csr_to_coo)
     const bool out_int32,
     const bool transpose) {
   TORCH_CHECK(
-      crow_indices.dim() == 1, "crow_indices is supposed to be a vector, but got ",
-      crow_indices.dim(), " dimensional tensor.");
-  TORCH_CHECK(col_indices.dim() == 1, "col_indices is supposed to be a vector, but got ",
-      col_indices.dim(), " dimensional tensor.");
+      crow_indices.dim() == col_indices.dim(), "crow_indices and col_indices are supposed to have"
+      " the same dimensionality, but got ", crow_indices.dim(), " and ",
+      crow_indices.dim(), " dimensional tensors, respectively.");
   ScalarType scalar_type = out_int32 ? ScalarType::Int : ScalarType::Long;
   c10::TensorOptions options = crow_indices.options().dtype(scalar_type);
-  set_output_raw_strided(0, {2, col_indices.numel()}, {}, options, {});
+  set_output_raw_strided(0, {col_indices.dim() + 1, col_indices.numel()}, {}, options, {});
 }

 } // namespace meta

 namespace {

-constexpr int64_t GRAIN_SIZE = at::internal::GRAIN_SIZE;
-
 template <typename F>
 Tensor& unary_op_out(F op_out, const Tensor& self, Tensor& result) {
   TORCH_INTERNAL_ASSERT(self.is_sparse_csr());
@@ -194,34 +191,6 @@ Tensor& unary_op_inplace(Tensor& self, const F& op_inplace, Args&&... args) {
   return self;
 }

-template <typename input_t, typename output_t>
-void convert_indices_from_csr_to_coo_cpu(
-    const Tensor& indices,
-    const Tensor& crow_indices,
-    const Tensor& col_indices,
-    const bool transpose = false) {
-  int64_t nrows = crow_indices.numel() - 1;
-  if (nrows == 0) {
-    indices.zero_();
-    return;
-  }
-  auto crow_indices_ = crow_indices.expect_contiguous();
-  const input_t* crow_indices_data_in = crow_indices_->data_ptr<input_t>();
-  TORCH_INTERNAL_ASSERT(indices.is_contiguous());
-  auto row0 = indices.select(0, transpose ? 1 : 0);
-  auto row1 = indices.select(0, transpose ? 0 : 1);
-  output_t* data_out = row0.data_ptr<output_t>();
-  row1.copy_(*col_indices.expect_contiguous());
-  at::parallel_for(0, nrows, GRAIN_SIZE, [&](int64_t start, int64_t end) {
-    for (const auto i : c10::irange(start, end)) {
-      std::fill(
-          &data_out[crow_indices_data_in[i]],
-          &data_out[crow_indices_data_in[i + 1]],
-          static_cast<output_t>(i));
-    }
-  });
-}
-
 } // end anonymous namespace

 namespace native {

aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu

+27-12
@@ -85,36 +85,51 @@ void convert_indices_from_coo_to_csr_cuda(const Tensor& result, const Tensor& in
 }

 template <typename input_t, typename output_t>
-__global__ void convert_indices_from_csr_to_coo_cuda_kernel(output_t* data_out, const input_t* data_in, const int64_t nrows) {
+__global__ void convert_indices_from_csr_to_coo_cuda_kernel(output_t* data_out, const input_t* data_in, const int64_t nrows, const int64_t nnz, const int64_t nbatches) {
   int64_t tid = blockDim.x * blockIdx.x + threadIdx.x;

-  if (tid < nrows) {
-    for (int64_t i = data_in[tid]; i < data_in[tid + 1]; i++)
-      data_out[i] = static_cast<output_t>(tid);
+  if (tid < nrows * nbatches) {
+    int64_t b = tid / nrows;
+    int64_t i_ = b * (nrows + 1) + tid % nrows;
+    for (int64_t i = data_in[i_]; i < data_in[i_ + 1]; i++) {
+      data_out[b * nnz + i] = static_cast<output_t>(tid % nrows);
+    }
   }
 }

 template <typename input_t, typename output_t>
 void convert_indices_from_csr_to_coo_cuda(const Tensor& indices, const Tensor& crow_indices, const Tensor& col_indices, const bool transpose=false) {
-  int64_t nrows = crow_indices.numel() - 1;
-  if (nrows == 0) {
+  int64_t nrows = crow_indices.size(-1) - 1;
+  int64_t nnz = col_indices.size(-1);
+  if (nrows == 0 || nnz == 0) {
     indices.zero_();
     return;
   }
+  int64_t total_nnz = col_indices.numel();
+  int64_t batch_ndim = crow_indices.dim() - 1;
+  if (batch_ndim > 0) {
+    auto batch_indices = indices.narrow(0, 0, batch_ndim);
+    batch_indices.copy_(batch_indices.new_ones(crow_indices.sizes().slice(0, batch_ndim))
+                            .nonzero()
+                            .transpose(0, 1)
+                            .repeat_interleave(nnz, 1));
+  }

   auto crow_indices_ = crow_indices.expect_contiguous();
   const input_t* crow_indices_data_in = crow_indices_->data_ptr<input_t>();
   TORCH_INTERNAL_ASSERT(indices.is_contiguous());
-  auto row0 = indices.select(0, transpose?1:0);
-  auto row1 = indices.select(0, transpose?0:1);
+  auto row0 = indices.select(0, transpose?batch_ndim + 1:batch_ndim + 0);
+  auto row1 = indices.select(0, transpose?batch_ndim + 0:batch_ndim + 1);
+  auto col_indices_ = col_indices.expect_contiguous();
+  row1.copy_(col_indices_->view({-1}));
   output_t* data_out = row0.data_ptr<output_t>();

-  // Run nrows threads...
+  // Run nrows * nbatches threads...
+  int64_t nbatches = total_nnz / nnz;
   int64_t THREADS = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock;
-  int64_t BLOCKS = (nrows + THREADS) / THREADS;
+  int64_t BLOCKS = (nrows * nbatches + THREADS) / THREADS;
   at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();
-  row1.copy_(*col_indices.expect_contiguous());
-  convert_indices_from_csr_to_coo_cuda_kernel<<<BLOCKS, THREADS, 0, stream>>>(data_out, crow_indices_data_in, nrows);
+  convert_indices_from_csr_to_coo_cuda_kernel<<<BLOCKS, THREADS, 0, stream>>>(data_out, crow_indices_data_in, nrows, nnz, nbatches);
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 }
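The rewritten kernel launches one thread per (batch, row) pair instead of one per row: thread tid decodes b = tid / nrows, reads batch b's row pointers, and writes row indices into that batch's nnz-sized slice of the output. A rough CPU emulation of that index arithmetic in Python (csr_rows_to_coo_rows is a hypothetical helper, not part of the commit):

import torch

def csr_rows_to_coo_rows(crow_indices, nnz):
    # crow_indices: (nbatches, nrows + 1) batched CSR row pointers; nnz is the
    # per-batch number of specified elements. Returns the flattened COO row
    # indices of length nbatches * nnz, handling one (batch, row) pair per "thread".
    nbatches, nrows = crow_indices.shape[0], crow_indices.shape[1] - 1
    out = torch.zeros(nbatches * nnz, dtype=torch.int64)
    for tid in range(nbatches * nrows):
        b, i = divmod(tid, nrows)
        start, end = int(crow_indices[b, i]), int(crow_indices[b, i + 1])
        out[b * nnz + start : b * nnz + end] = i
    return out

crow = torch.tensor([[0, 1, 1, 2], [0, 0, 2, 2]])  # two batches, 3 rows, nnz=2 each
print(csr_rows_to_coo_rows(crow, 2))  # tensor([0, 2, 1, 1])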

test/test_sparse.py

-9
@@ -4660,15 +4660,6 @@ def explicit_to_sparse(x):
                             r"conversion from Sparse to .* for input tensors with sparse_dim\(\)!=2 is not supported"):
                         explicit_to_sparse(t)
                     continue
-                elif from_layout in {torch.sparse_csr, torch.sparse_csc,
-                                     torch.sparse_bsr, torch.sparse_bsc} and to_layout is torch.sparse_coo and is_batch:
-                    with self.assertRaisesRegex(RuntimeError,
-                                                "crow_indices is supposed to be a vector, but got \\d+ dimensional tensor"):
-                        t.to_sparse(layout=to_layout, blocksize=blocksize)
-                    with self.assertRaisesRegex(RuntimeError,
-                                                "crow_indices is supposed to be a vector, but got \\d+ dimensional tensor"):
-                        explicit_to_sparse(t)
-                    continue
                 elif (from_layout, to_layout) in {(torch.sparse_bsc, torch.sparse_csr), (torch.sparse_bsc, torch.sparse_csc),
                                                   (torch.sparse_bsr, torch.sparse_csr), (torch.sparse_bsr, torch.sparse_csc)}:
                     with self.assertRaisesRegex(

torch/testing/_internal/opinfo/definitions/sparse.py

+4-1
@@ -574,7 +574,10 @@ def _validate_sample_input_elementwise_binary_sparse_mul(sample):
     if layout is torch.sparse_csr and batch_dim > 0 and t_args[0].ndim > 0:
         return ErrorInput(
             sample,
-            error_regex="crow_indices is supposed to be a vector, but got 2 dimensional tensor",
+            error_regex=(
+                "coo_to_sparse_csr: conversion from Sparse to SparseCsr for input"
+                " tensors with sparse_dim[(][)]!=2 is not supported"
+            ),
         )
     elif layout is torch.sparse_csc and t_args[0].ndim > 0:
         return ErrorInput(
