
Commit 90dabff

kurtamohler authored and pytorchmergebot committed
Avoid COW materialize in various operations (pytorch#119506)
Operations affected include dot, cross, scatter/gather, shape, sort, triangular, unary, scalar, pad, complex, to_list, fft

Pull Request resolved: pytorch#119506
Approved by: https://github.com/ezyang
ghstack dependencies: pytorch#119501, pytorch#119502, pytorch#119503, pytorch#119504
1 parent 8a09f13 commit 90dabff

19 files changed (+76, -76 lines)
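The changes below all follow one pattern: tensors that are only read are accessed through const-qualified entry points (const_data_ptr(), add_const_input()) instead of the mutable ones (data_ptr(), add_input()), so a copy-on-write (COW) tensor is not forced to materialize a private copy of its storage just to be read. A minimal sketch of the read-only access pattern, assuming a recent ATen build; the helper name sum1d is hypothetical and not part of this commit:

#include <ATen/ATen.h>

// Sums a 1-D double tensor using only read access. const_data_ptr<T>() hands
// back a const T*, so no mutable access is requested and a lazily-copied
// (COW) tensor keeps sharing its storage.
double sum1d(const at::Tensor& t) {
  TORCH_CHECK(t.dim() == 1 && t.scalar_type() == at::kDouble, "expected a 1-D double tensor");
  const double* data = t.const_data_ptr<double>();
  double acc = 0.0;
  for (int64_t i = 0; i < t.numel(); ++i) {
    acc += data[i * t.stride(0)];
  }
  return acc;
}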

aten/src/ATen/native/Blas.cpp

+2 -2

@@ -185,7 +185,7 @@ Tensor dot(const Tensor &self, const Tensor &other){

   return AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::BFloat16, at::ScalarType::Half, self.scalar_type(), "dot", [&] {
     Tensor result = at::empty({}, self.options());
-    result.fill_(dot_impl<scalar_t>(self.numel(), self.data_ptr<scalar_t>(), self.stride(0), other.data_ptr<scalar_t>(), other.stride(0)));
+    result.fill_(dot_impl<scalar_t>(self.numel(), const_cast<scalar_t*>(self.const_data_ptr<scalar_t>()), self.stride(0), const_cast<scalar_t*>(other.const_data_ptr<scalar_t>()), other.stride(0)));
     return result;
   });
 }
@@ -216,7 +216,7 @@ Tensor vdot(const Tensor &self, const Tensor &other){

   return AT_DISPATCH_COMPLEX_TYPES(self.scalar_type(), "vdot", [&] {
     Tensor result = at::empty({}, self.options());
-    result.fill_(vdot_impl<scalar_t>(self.numel(), self.data_ptr<scalar_t>(), self.stride(0), other.data_ptr<scalar_t>(), other.stride(0)));
+    result.fill_(vdot_impl<scalar_t>(self.numel(), const_cast<scalar_t*>(self.const_data_ptr<scalar_t>()), self.stride(0), const_cast<scalar_t *>(other.const_data_ptr<scalar_t>()), other.stride(0)));
     return result;
   });
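The dot/vdot hunks above still need a const_cast because dot_impl/vdot_impl take non-const pointers even though the call only reads through them. A self-contained sketch of that call boundary, with hypothetical names (legacy_dot, dot_readonly) standing in for the real helpers:

#include <cstdint>

// Stand-in for a legacy routine that only reads through its pointer arguments
// but declares them non-const, as many BLAS-style interfaces do.
double legacy_dot(int64_t n, double* x, int64_t incx, double* y, int64_t incy) {
  double acc = 0.0;
  for (int64_t i = 0; i < n; ++i) {
    acc += x[i * incx] * y[i * incy];
  }
  return acc;
}

// The caller keeps its data const and casts away constness only at the call
// boundary; this is safe only because legacy_dot never writes through x or y.
double dot_readonly(const double* x, const double* y, int64_t n) {
  return legacy_dot(n, const_cast<double*>(x), 1, const_cast<double*>(y), 1);
}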

aten/src/ATen/native/Scalar.cpp

+1 -1

@@ -40,7 +40,7 @@ Scalar _local_scalar_dense_cpu(const Tensor& self) {
       self.scalar_type(),
       "_local_scalar_dense_cpu",
       AT_WRAP([&] {
-        scalar_t value = *self.data_ptr<scalar_t>();
+        scalar_t value = *self.const_data_ptr<scalar_t>();
         r = Scalar(value);
       }),
       AT_EXPAND(AT_SD_TYPES)

aten/src/ATen/native/Sorting.cpp

+1 -1

@@ -546,7 +546,7 @@ std::tuple<Tensor&, Tensor&> median_with_indices_impl(
     .declare_static_shape(sizes, /*squash_dims=*/dim)
     .add_output(vals)
     .add_output(inds)
-    .add_input(in)
+    .add_const_input(in)
     .build();

   AT_DISPATCH_ALL_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, in.scalar_type(), "median_out", [&] {

aten/src/ATen/native/TensorFactories.cpp

+4 -4

@@ -214,8 +214,8 @@ Tensor& complex_out(const Tensor& real, const Tensor& imag, Tensor& result) {
   complex_check_dtype(result, real, imag);
   auto iter = TensorIteratorConfig()
       .add_output(result)
-      .add_input(real)
-      .add_input(imag)
+      .add_const_input(real)
+      .add_const_input(imag)
       .check_all_same_dtype(false)
       .build();
   complex_stub(iter.device_type(), iter);
@@ -234,8 +234,8 @@ Tensor& polar_out(const Tensor& abs, const Tensor& angle, Tensor& result) {
   complex_check_dtype(result, abs, angle);
   auto iter = TensorIteratorConfig()
       .add_output(result)
-      .add_input(abs)
-      .add_input(angle)
+      .add_const_input(abs)
+      .add_const_input(angle)
       .check_all_same_dtype(false)
       .build();
   polar_stub(iter.device_type(), iter);

aten/src/ATen/native/TensorShape.cpp

+11 -11

@@ -554,7 +554,7 @@ static void fastCatOutDim0(const Tensor& out, const MaterializedITensorListRef&
   for (const Tensor& input : inputs) {
     TORCH_CHECK(outBytes >= totalBytes);
     if (input.nbytes() > 0) {
-      std::memcpy(dataPtr + totalBytes, input.data_ptr(), input.nbytes());
+      std::memcpy(dataPtr + totalBytes, input.const_data_ptr(), input.nbytes());
     }
     totalBytes += input.nbytes();
   }
@@ -609,18 +609,18 @@ TORCH_IMPL_FUNC(cat_out_cpu)
       .set_check_mem_overlap(false)
       .resize_outputs(false)
       .add_output(result_slice)
-      .add_input(source_slice)
+      .add_const_input(source_slice)
       .enforce_safe_casting_to_output(true)
       .build();

     for (const Tensor& tensor : materialized) {
       if (cat_should_skip_tensor(tensor)) {
         continue;
       }
-      auto source_data = static_cast<char*>(tensor.data_ptr());
+      auto source_data = static_cast<const char*>(tensor.const_data_ptr());
       auto result_data = static_cast<char*>(result_slice_data) + offset * result_stride_bytes;
       iter.unsafe_replace_operand(0, result_data);
-      iter.unsafe_replace_operand(1, source_data);
+      iter.unsafe_replace_operand(1, const_cast<char*>(source_data));
       copy_stub(iter.device_type(), iter, false);
       offset += slice_dim_size;
     }
@@ -636,7 +636,7 @@ TORCH_IMPL_FUNC(cat_out_cpu)
       .set_check_mem_overlap(false) // Already checked above
       .resize_outputs(false)
       .add_output(result_slice)
-      .add_input(tensor)
+      .add_const_input(tensor)
       .promote_inputs_to_common_dtype(true)
       .cast_common_dtype_to_outputs(true)
       .enforce_safe_casting_to_output(true)
@@ -1004,7 +1004,7 @@ std::vector<Tensor> tensor_split(const Tensor& self, const Tensor& tensor_indice
     int64_t sections = tensor_indices_or_sections.item<int64_t>();
     return self.tensor_split(sections, dim);
   } else {
-    auto indices_data = tensor_indices_or_sections.data_ptr<int64_t>();
+    auto indices_data = tensor_indices_or_sections.const_data_ptr<int64_t>();
     auto stride = tensor_indices_or_sections.stride(0);
     auto numel = tensor_indices_or_sections.numel();
     std::vector<int64_t> indices(numel);
@@ -1344,22 +1344,22 @@ Tensor& narrow_copy_dense_cpu_out(
     return output;
   }

-  char* src_bytes = static_cast<char*>(self_contig->data_ptr());
+  const char* src_bytes = static_cast<const char*>(self_contig->const_data_ptr());
   char* dst_bytes = static_cast<char*>(output.data_ptr());

   size_t src_block_size_bytes = itemsize * src_block_size;
   size_t dst_block_size_bytes = itemsize * dst_block_size;
   size_t src_offset = unit * start;

-  char* src_offset_bytes = src_bytes + itemsize * src_offset;
+  const char* src_offset_bytes = src_bytes + itemsize * src_offset;
   char* dst_offset_bytes = dst_bytes;

   for (const auto i : c10::irange(num_blocks)) {
-    char* local_src_offset_bytes = src_offset_bytes + i * src_block_size_bytes;
+    const char* local_src_offset_bytes = src_offset_bytes + i * src_block_size_bytes;
     char* local_dst_offset_bytes = dst_offset_bytes + i * dst_block_size_bytes;
     TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
-        static_cast<void*>(local_src_offset_bytes + dst_block_size_bytes) <=
-        static_cast<void*>(src_bytes + src_nbytes));
+        static_cast<const void*>(local_src_offset_bytes + dst_block_size_bytes) <=
+        static_cast<const void*>(src_bytes + src_nbytes));
     TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
         static_cast<void*>(local_dst_offset_bytes + dst_block_size_bytes) <=
         static_cast<void*>(dst_bytes + dst_nbytes));

aten/src/ATen/native/TensorTransformations.cpp

+2 -2

@@ -63,8 +63,8 @@ Tensor flip(const Tensor& self, IntArrayRef dims) {
     .check_all_same_dtype(false)
     .declare_static_dtype_and_device(self.scalar_type(), self.device())
     .add_output(out_tensor)
-    .add_input(self)
-    .add_input(restrided_self)
+    .add_const_input(self)
+    .add_const_input(restrided_self)
     .build();

   auto* data = reinterpret_cast<char*>(iter.data_ptr(0));

aten/src/ATen/native/TriangularOps.cpp

+3 -3

@@ -41,7 +41,7 @@ namespace {
 template <typename scalar_t>
 void apply_triu_tril_single(
     scalar_t* result,
-    scalar_t* self,
+    const scalar_t* self,
     bool inplace,
     int64_t k,
     int64_t n,
@@ -86,7 +86,7 @@ template <typename scalar_t>
 void apply_triu_tril(const Tensor& result, const Tensor& self, bool inplace, int64_t k, bool upper) {
   auto n = self.size(-2);
   auto m = self.size(-1);
-  auto self_data = self.data_ptr<scalar_t>();
+  auto self_data = self.const_data_ptr<scalar_t>();
   auto self_stride = (self.dim() > 2 && self.stride(-3) > 0) ? self.stride(-3) : 1;
   auto batchsize = batchCountTrilTriu(result);
   auto self_row_stride = self.stride(-2);
@@ -107,7 +107,7 @@ void apply_triu_tril(const Tensor& result, const Tensor& self, bool inplace, int

   parallel_for(0, batchsize, 0, [&](int64_t start, int64_t end) {
     for (const auto b : c10::irange(start, end)) {
-      scalar_t* self_batch = &self_data[b * self_stride];
+      const scalar_t* self_batch = &self_data[b * self_stride];
       scalar_t* result_batch = &result_data[b * result_stride];
       apply_triu_tril_single<scalar_t>(
           result_batch,

aten/src/ATen/native/UnaryOps.cpp

+2 -2

@@ -868,7 +868,7 @@ Tensor& logical_not_out(const Tensor& self, Tensor& result) {
   TensorIterator iter = TensorIteratorConfig()
     .check_all_same_dtype(false)
     .add_output(result)
-    .add_input(self)
+    .add_const_input(self)
     .build();
   logical_not_stub(iter.device_type(), iter);
   return result;
@@ -964,7 +964,7 @@ std::tuple<Tensor&, Tensor&> frexp_out(const Tensor& self,
   auto iter = TensorIteratorConfig()
     .add_output(mantissa)
     .add_output(exponent)
-    .add_input(self)
+    .add_const_input(self)
     .check_all_same_dtype(false)
     .set_check_mem_overlap(true)
     .build();
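Many of the hunks in this commit make the same one-line change inside a TensorIteratorConfig chain: inputs that are only read are registered with add_const_input() instead of add_input(), so building the iterator does not request mutable access to them. A minimal sketch of that configuration, assuming it is compiled against ATen; the helper name make_readonly_unary_iter is hypothetical:

#include <ATen/ATen.h>
#include <ATen/TensorIterator.h>

// Builds an iterator over one writable output and one read-only input. The
// built iterator is then handed to a kernel or stub exactly as in the hunks
// above, e.g. logical_not_stub(iter.device_type(), iter).
at::TensorIterator make_readonly_unary_iter(const at::Tensor& in, at::Tensor& out) {
  return at::TensorIteratorConfig()
      .check_all_same_dtype(false)
      .add_output(out)        // output may be written; keep the mutable overload
      .add_const_input(in)    // input is read-only; no COW materialization
      .build();
}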

aten/src/ATen/native/cpu/CatKernel.cpp

+3 -3

@@ -12,11 +12,11 @@ namespace at::native {
 namespace {

 struct InputMeta {
-  void* data_ptr;
+  const void* data_ptr;
   int64_t inner_size;

   InputMeta(const Tensor& t, int64_t dim, int64_t inner)
-    : data_ptr(t.data_ptr())
+    : data_ptr(t.const_data_ptr())
     , inner_size(t.sizes()[dim] * inner) {}
 };

@@ -38,7 +38,7 @@ void cat_serial_kernel_impl(const Tensor& result, const MaterializedITensorListR
   for (const auto i : c10::irange(outer)) {
     for (const auto j : c10::irange(ninputs)) {
       int64_t local_inner = inputs[j].inner_size;
-      scalar_t* input_ptr = (scalar_t*)(inputs[j].data_ptr) + i * local_inner;
+      const scalar_t* input_ptr = (const scalar_t*)(inputs[j].data_ptr) + i * local_inner;
       int64_t d = 0;
       for (; d < local_inner - (local_inner % Vec::size()); d += Vec::size()) {
         Vec in_vec = Vec::loadu(input_ptr + d);

aten/src/ATen/native/cpu/CrossKernel.cpp

+2 -2

@@ -21,8 +21,8 @@ static void apply_cross(const Tensor& result, const Tensor& a, const Tensor& b,
   int64_t b_stride = b.stride(dim);
   int64_t r_stride = result.stride(dim);

-  scalar_t *a_ptr = a.data_ptr<scalar_t>();
-  scalar_t *b_ptr = b.data_ptr<scalar_t>();
+  const scalar_t *a_ptr = a.const_data_ptr<scalar_t>();
+  const scalar_t *b_ptr = b.const_data_ptr<scalar_t>();
   scalar_t *r_ptr = result.data_ptr<scalar_t>();

   parallel_for(0, total, internal::GRAIN_SIZE, [&](int64_t s, int64_t e) {

aten/src/ATen/native/cpu/PaddingKernel.cpp

+7 -7

@@ -136,7 +136,7 @@ void cpu_padding(
   auto input = input_.contiguous();
   auto output = output_.contiguous();

-  auto input_data = input.data_ptr<scalar_t>();
+  auto input_data = input.const_data_ptr<scalar_t>();
   auto output_data = output.data_ptr<scalar_t>();

   // fold nbatch and channels into single dimension for channels first.
@@ -158,7 +158,7 @@ void cpu_padding(

   // do vectorized copy whe output is overlapped with input on W,
   // only applies to positive padding
-  auto loop = [=](scalar_t* out, scalar_t* in, bool positive_padding) {
+  auto loop = [=](scalar_t* out, const scalar_t* in, bool positive_padding) {
     if (positive_padding) {
       for (const auto ow : c10::irange(pad_w)) {
         int64_t iw = PaddingType::index(ow, input_width, pad_w, offset_w);
@@ -198,7 +198,7 @@ void cpu_padding(
     for (const auto i : c10::irange(begin, end)) {
       int64_t ih = PaddingType::index(oh, input_height, pad_h, offset_h);
       scalar_t* output_ptr = output_data + i * output_width;
-      scalar_t* input_ptr = input_data + c * input_height * input_width + ih * input_width;
+      const scalar_t* input_ptr = input_data + c * input_height * input_width + ih * input_width;

       loop(output_ptr, input_ptr, p.is_padding_positive_width);
       data_index_step(c, channels, oh, output_height);
@@ -214,7 +214,7 @@ void cpu_padding(
       int64_t id = PaddingType::index(od, input_depth, pad_d, offset_d);
       int64_t ih = PaddingType::index(oh, input_height, pad_h, offset_h);
       scalar_t* output_ptr = output_data + i * output_width;
-      scalar_t* input_ptr = input_data + c * input_depth * input_height * input_width +
+      const scalar_t* input_ptr = input_data + c * input_depth * input_height * input_width +
           id * input_height * input_width + ih * input_width;

       loop(output_ptr, input_ptr, p.is_padding_positive_width);
@@ -243,7 +243,7 @@ void cpu_padding_channels_last(
   auto input = input_.contiguous(memory_format);
   auto output = output_.contiguous(memory_format);

-  auto input_data = input.data_ptr<scalar_t>();
+  auto input_data = input.const_data_ptr<scalar_t>();
   auto output_data = output.data_ptr<scalar_t>();

   int64_t nbatch = p.nbatch;
@@ -274,7 +274,7 @@ void cpu_padding_channels_last(
       int64_t iw = PaddingType::index(ow, input_width, pad_w, offset_w);

       scalar_t* output_ptr = output_data + i * channels;
-      scalar_t* input_ptr = input_data + (n * input_height * input_width + ih * input_width + iw) * channels;
+      const scalar_t* input_ptr = input_data + (n * input_height * input_width + ih * input_width + iw) * channels;
       copy_stub(output_ptr, input_ptr, channels);

       data_index_step(n, nbatch, oh, output_height, ow, output_width);
@@ -292,7 +292,7 @@ void cpu_padding_channels_last(
       int64_t iw = PaddingType::index(ow, input_width, pad_w, offset_w);

       scalar_t* output_ptr = output_data + i * channels;
-      scalar_t* input_ptr = input_data + (n * input_depth * input_height * input_width +
+      const scalar_t* input_ptr = input_data + (n * input_depth * input_height * input_width +
           id * input_height * input_width + ih * input_width + iw) * channels;
       copy_stub(output_ptr, input_ptr, channels);

aten/src/ATen/native/cpu/ScatterGatherKernel.cpp

+14 -14

@@ -186,7 +186,7 @@ struct cpu_scatter_gather_base_kernel {
       // NOLINTNEXTLINE(bugprone-argument-comment)
       .declare_static_shape(index.sizes(), /*squash_dim=*/dim)
       .add_output(buffer)
-      .add_input(index)
+      .add_const_input(index)
       .build();

     auto self_dim_stride = ensure_nonempty_stride(buffer, dim);
@@ -273,8 +273,8 @@ struct cpu_scatter_gather_base_kernel {
       // NOLINTNEXTLINE(bugprone-argument-comment)
       .declare_static_shape(index.sizes(), /*squash_dim=*/dim)
       .add_output(buffer)
-      .add_input(src)
-      .add_input(index)
+      .add_const_input(src)
+      .add_const_input(index)
       .build();

     auto self_dim_stride = ensure_nonempty_stride(buffer, dim);
@@ -369,8 +369,8 @@ struct cpu_scatter_gather_base_kernel {
       // NOLINTNEXTLINE(bugprone-argument-comment)
       .declare_static_shape(index.sizes(), /*squash_dim=*/dim)
       .add_output(buffer)
-      .add_input(src)
-      .add_input(index)
+      .add_const_input(src)
+      .add_const_input(index)
       .build();

     auto self_dim_stride = ensure_nonempty_stride(buffer, dim);
@@ -464,8 +464,8 @@ struct cpu_scatter_gather_base_kernel {
       // NOLINTNEXTLINE(bugprone-argument-comment)
       .declare_static_shape(index.sizes(), /*squash_dim=*/dim)
       .add_output(buffer)
-      .add_input(src)
-      .add_input(index)
+      .add_const_input(src)
+      .add_const_input(index)
       .build();

     auto self_dim_stride = ensure_nonempty_stride(buffer, dim);
@@ -560,8 +560,8 @@ struct cpu_scatter_gather_base_kernel {
       // NOLINTNEXTLINE(bugprone-argument-comment)
       .declare_static_shape(index.sizes(), /*squash_dim=*/dim)
       .add_output(buffer)
-      .add_input(src)
-      .add_input(index)
+      .add_const_input(src)
+      .add_const_input(index)
       .build();

     auto self_dim_stride = ensure_nonempty_stride(buffer, dim);
@@ -687,9 +687,9 @@ std::pair<K*, V*> radix_sort_parallel(

 template <typename scalar_t, ReductionType reduce>
 void cpu_scatter_reduce_expanded_index(const Tensor& self, const Tensor& index, const Tensor& src, bool include_self) {
-  int64_t* index_data = index.data_ptr<int64_t>();
+  const int64_t* index_data = index.const_data_ptr<int64_t>();
   scalar_t* self_data = self.data_ptr<scalar_t>();
-  scalar_t* src_data = src.data_ptr<scalar_t>();
+  const scalar_t* src_data = src.const_data_ptr<scalar_t>();

   const int64_t M = ensure_nonempty_size(self, 0);
   const int64_t nnz = ensure_nonempty_size(index, 0);
@@ -812,9 +812,9 @@ void cpu_scatter_reduce_expanded_index(const Tensor& self, const Tensor& index,

 template <typename scalar_t>
 void cpu_gather_expanded_index_kernel(const Tensor& result, const Tensor& index, const Tensor& self) {
-  int64_t* index_data = index.data_ptr<int64_t>();
+  const int64_t* index_data = index.const_data_ptr<int64_t>();
   scalar_t* result_data = result.data_ptr<scalar_t>();
-  scalar_t* self_data = self.data_ptr<scalar_t>();
+  const scalar_t* self_data = self.const_data_ptr<scalar_t>();

   const int64_t M = ensure_nonempty_size(result, 0);
   const int64_t N = ensure_nonempty_size(self, 0);
@@ -832,7 +832,7 @@ void cpu_gather_expanded_index_kernel(const Tensor& result, const Tensor& index,
           "index ", index,
           " is out of bounds for dimension ", 0,
           " with size ", index_upper_bound);
-      scalar_t* self_ptr = self_data + index * K;
+      const scalar_t* self_ptr = self_data + index * K;
       int64_t d = 0;
       for (; d < K - (K % Vec::size()); d += Vec::size()) {
         Vec out_vec = Vec::loadu(self_ptr + d);

aten/src/ATen/native/cpu/SortingKernel.cpp

+1 -1

@@ -216,7 +216,7 @@ static void topk_kernel(
     .declare_static_shape(sizes, /*squash_dims=*/dim)
     .add_output(values)
     .add_output(indices)
-    .add_input(self)
+    .add_const_input(self)
     .build();

   auto mode_values_stride = values.strides()[dim];

aten/src/ATen/native/cuda/CrossKernel.cu

+2 -2

@@ -68,8 +68,8 @@ void cross_impl(const Tensor& result, const Tensor& x1, const Tensor& x2, int64_

   auto iter = TensorIteratorConfig()
       .add_output(result)
-      .add_input(x1)
-      .add_input(x2)
+      .add_const_input(x1)
+      .add_const_input(x2)
       .resize_outputs(false)
       .declare_static_shape(result.sizes(), /*squash_dims=*/dim)
       .build();
