Commit 77f2883

cyyever authored and pytorchmergebot committed
[Reland2] fix missing-prototypes warnings in torch_cpu (Part 4) (pytorch#102228)
This PR relands the changes introduced in PR pytorch#100849. The old PR turned the nnc_* functions into static functions. We now add declarations for them and hope that internal builds will pass.

Pull Request resolved: pytorch#102228
Approved by: https://github.com/albanD
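For context, the fix pattern behind these diffs, sketched below with invented names (illustrative only, not code from the tree): -Wmissing-prototypes fires on a function definition that has external linkage but no prior declaration, and the two remedies used throughout this commit are to give purely file-local helpers internal linkage with static, or to keep external linkage and provide a declaration (the route taken for the nnc_* functions).

// missing_prototypes_demo.cpp -- hypothetical sketch of the two fixes.
// Compile with: clang++ -Wmissing-prototypes -c missing_prototypes_demo.cpp
//
// Before either fix, a definition like
//   int helper(int x) { return x + 1; }
// warns: "no previous prototype for function 'helper'".

// Fix 1: the helper is only used in this translation unit, so give it
// internal linkage; the warning does not apply to static functions.
static int local_helper(int x) {
  return x + 1;
}

// Fix 2: the function must stay externally visible (as with the nnc_*
// kernels), so keep the definition as-is and add a prior declaration,
// normally by including the header that declares it.
int exported_helper(int x);  // in real code this lives in a header

int exported_helper(int x) {
  return local_helper(x) * 2;
}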
1 parent 86c7652 commit 77f2883

138 files changed, +577 -773 lines changed


BUILD.bazel

+1

@@ -1598,6 +1598,7 @@ TORCH_COPTS = COMMON_COPTS + [
     "-fvisibility-inlines-hidden",
     "-fno-math-errno ",
     "-fno-trapping-math",
+    "-Wno-error=unused-function",
 ]

 torch_sources = {
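A likely motivation for the new copt (my reading; the commit message does not say): once helpers that previously had external linkage become static, any configuration that compiles out their only caller trips -Wunused-function, which -Werror would turn into a build break. A minimal, hypothetical illustration follows (file and function names are invented):

// unused_function_demo.cpp -- hypothetical example, not from the PyTorch tree.
// With -Wall -Werror, a static helper whose only caller is compiled out fails
// the build; -Wno-error=unused-function demotes that back to a warning:
//   clang++ -Wall -Werror -DUSE_FAST_PATH=0 -c unused_function_demo.cpp
//   clang++ -Wall -Werror -Wno-error=unused-function -DUSE_FAST_PATH=0 -c unused_function_demo.cpp

static int fast_path(int x) {  // unused when USE_FAST_PATH is 0
  return x * 2;
}

int compute(int x) {
#if USE_FAST_PATH
  return fast_path(x);
#else
  return x + x;
#endif
}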

aten/src/ATen/core/ivalue.cpp

+1 -1

@@ -763,7 +763,7 @@ IValueComparator getGreaterThanComparator(const IValue& v) {
   };
 }

-static std::ostream& operator<<(std::ostream& out, const ivalue::EnumHolder& v) {
+std::ostream& operator<<(std::ostream& out, const ivalue::EnumHolder& v) {
   out << v.qualifiedClassName() << "." << v.name();
   return out;
 }

aten/src/ATen/core/ivalue_inl.h

+1 -1

@@ -1622,7 +1622,7 @@ struct ivalue::EnumHolder : c10::intrusive_ptr_target {

   TORCH_API friend std::ostream& operator<<(
       std::ostream& out,
-      const EnumHolder& v);
+      const ivalue::EnumHolder& v);

   TORCH_API const std::string qualifiedClassName() const;

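The two ivalue changes above go together: operator<< keeps external linkage (rather than being made static, as in the old PR) because it is declared as a TORCH_API friend of EnumHolder in the header, and the friend declaration's parameter type is spelled out so that declaration and definition line up exactly. A reduced, hypothetical sketch of the friend-plus-definition pattern, with invented names:

// friend_ostream_demo.cpp -- reduced sketch; not the real ivalue code.
#include <iostream>

namespace demo {
struct EnumHolder {
  // The friend declaration serves as the prior declaration that the
  // out-of-line definition below matches, so the definition stays non-static.
  friend std::ostream& operator<<(std::ostream& out, const EnumHolder& v);
  const char* name = "Color.RED";
};

std::ostream& operator<<(std::ostream& out, const EnumHolder& v) {
  return out << v.name;
}
} // namespace demo

int main() {
  std::cout << demo::EnumHolder{} << '\n';  // prints: Color.RED
  return 0;
}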

aten/src/ATen/functorch/BatchRulesReduceOps.cpp

+2 -2

@@ -405,7 +405,7 @@ static std::tuple<Tensor,optional<int64_t>> searchsorted_batch_rule(
   TORCH_INTERNAL_ASSERT(false);
 }

-Tensor bucketize_decomp_Tensor(
+static Tensor bucketize_decomp_Tensor(
     const Tensor& self,
     const Tensor& boundaries,
     bool out_int32,
@@ -415,7 +415,7 @@ Tensor bucketize_decomp_Tensor(
   return at::searchsorted(boundaries, self, out_int32, right, nullopt, nullopt);
 }

-Tensor bucketize_decomp_Scalar(
+static Tensor bucketize_decomp_Scalar(
     const Scalar& self,
     const Tensor& boundaries,
     bool out_int32,

aten/src/ATen/native/Activation.cpp

+2 -3

@@ -374,8 +374,8 @@ TORCH_IMPL_FUNC(softshrink_backward_out) (
   shrink_backward_stub(device_type(), *this, lambd);
 }

-static bool use_mkldnn(const Tensor& input) {
 #if AT_MKLDNN_ENABLED()
+static bool use_mkldnn(const Tensor& input) {
   if (!at::globalContext().userEnabledMkldnn()) {
     return false;
   }
@@ -386,9 +386,8 @@ static bool use_mkldnn(const Tensor& input) {
      (input.device().is_cpu() &&
      (((input.scalar_type() == kBFloat16) && mkldnn_bf16_device_check()) ||
      (input.scalar_type() == kFloat))); // input is dense layout and bfloat16/float32
-#endif
-  return false;
 }
+#endif

 TORCH_IMPL_FUNC(gelu_out_cpu) (
   const Tensor& self, c10::string_view approximate, const Tensor& result

aten/src/ATen/native/BinaryOps.cpp

+3 -3

@@ -809,7 +809,7 @@ Tensor& arctan2_out(const Tensor& self, const Tensor& other, Tensor& result) {
   return at::atan2_out(result, self, other);
 }

-Tensor& add_relu_impl(
+static Tensor& add_relu_impl(
     Tensor& result, const Tensor& self, const Tensor& other, const Scalar& alpha) {
   auto iter = TensorIterator::binary_op(result, self, other);
   Scalar min_val;
@@ -1003,7 +1003,7 @@ Tensor& mul__scalar_sparse_csr(Tensor& self, const Scalar& other) {
   return self;
 }

-Device correct_out_device(const Tensor& self, const Tensor& other) {
+static Device correct_out_device(const Tensor& self, const Tensor& other) {
   if (self.device() == at::kCPU){
     return other.device();
   } else {
@@ -1049,7 +1049,7 @@ Tensor div_zerotensor(const Tensor& self, const Tensor& other) {
   }
 }

-Tensor maybe_add_maybe_sub(const Tensor& self, const Tensor& other, const Scalar& alpha) {
+static Tensor maybe_add_maybe_sub(const Tensor& self, const Tensor& other, const Scalar& alpha) {
   auto out_device = correct_out_device(self, other);
   // hack to use the TensorIterator to get the correct broadcasting and type promotion logic
   auto device_ = Device(DeviceType::Meta);

aten/src/ATen/native/Convolution.cpp

+2

@@ -771,6 +771,7 @@ static void check_input_same_type_as_parameters(
   check_input_same_type_as_parameters(input, weight, /*bias=*/ Tensor());
 }

+#if AT_MKLDNN_ENABLED()
 static void check_input_same_type_as_parameters(
     const Tensor& input,
     const Tensor& weight,
@@ -789,6 +790,7 @@ static void check_input_same_type_as_parameters(
     check_input_same_type_as_parameters(input, weight, bias);
   }
 }
+#endif

 static auto view4d(const at::Tensor& tensor) -> at::Tensor {
   TORCH_CHECK(tensor.ndimension() == 3,

aten/src/ATen/native/Copy.cpp

+1

@@ -21,6 +21,7 @@
 #include <ATen/NativeFunctions.h>
 #else
 #include <ATen/ops/_copy_from.h>
+#include <ATen/ops/_propagate_xla_data.h>
 #include <ATen/ops/copy_native.h>
 #include <ATen/ops/empty.h>
 #include <ATen/ops/expand_copy.h>

aten/src/ATen/native/LegacyBatching.cpp

+5

@@ -3,6 +3,11 @@
 #include <ATen/WrapDimUtils.h>
 #include <ATen/LegacyVmapTransforms.h>

+#ifdef AT_PER_OPERATOR_HEADERS
+#include <ATen/ops/_add_batch_dim_native.h>
+#include <ATen/ops/_remove_batch_dim_native.h>
+#endif
+
 namespace at { namespace native {

 // Adds a batch dimension to the tensor `self` out-of-place

aten/src/ATen/native/LinearAlgebra.cpp

+1 -1

@@ -1893,7 +1893,7 @@ The behavior depends on the dimensionality of the Tensors as follows:
 - Otherwise, we return bmm, after broadcasting and folding the batched dimensions if
   there's more than one
 */
-Tensor _matmul_impl(
+static Tensor _matmul_impl(
     Tensor& out,
     const Tensor& tensor1,
     const Tensor& tensor2) {

aten/src/ATen/native/PackedSequence.cpp

+1 -1

@@ -20,7 +20,7 @@

 namespace at { namespace native {

-void checkLongTensor(const Tensor& tensor) {
+static void checkLongTensor(const Tensor& tensor) {
   TORCH_CHECK(tensor.dim() == 1 && tensor.device().type() == at::kCPU && tensor.scalar_type() == at::kLong,
               "'lengths' argument should be a 1D CPU int64 tensor, but got ",
               tensor.dim(), "D ", tensor.device().str(), " ", tensor.scalar_type(), " tensor");

aten/src/ATen/native/RNN.cpp

+1 -1

@@ -1809,7 +1809,7 @@ std::tuple<Tensor, Tensor, Tensor> quantized_lstm_data(
       std::move(std::get<2>(results)));
 }

-std::tuple<Tensor, Tensor, Tensor> quantized_lstm_data_legacy(
+static std::tuple<Tensor, Tensor, Tensor> quantized_lstm_data_legacy(
     const Tensor& data,
     const Tensor& batch_sizes,
     c10::List<at::Tensor> hx_,

aten/src/ATen/native/Resize.cpp

+1

@@ -11,6 +11,7 @@
 #include <ATen/ops/resize_as_native.h>
 #include <ATen/ops/resize_native.h>
 #include <ATen/ops/resize.h>
+#include <ATen/ops/_resize_output.h>
 #endif

 namespace at { namespace native {

aten/src/ATen/native/TensorAdvancedIndexing.cpp

+4 -8

@@ -400,7 +400,7 @@ static void build_index_op(
   iter.build(config);
 }

-void check_indices_on_cpu_or_selfdevice(
+static void check_indices_on_cpu_or_selfdevice(
     const Tensor& self,
     const at::MaterializedIOptTensorListRef& indices) {
   auto dev = self.device();
@@ -965,7 +965,7 @@ TORCH_IMPL_FUNC(index_add_cpu_out)
   }
 }

-void index_reduce_func_impl(
+static void index_reduce_func_impl(
     const Tensor& self,
     int64_t dim,
     const Tensor& index,
@@ -1149,7 +1149,7 @@ static void check_indexarray_range(
   }
 }

-Tensor & index_select_out_cpu_dim1_(
+static Tensor & index_select_out_cpu_dim1_(
     Tensor & result_contig, const Tensor & self, const Tensor & index_contig) {

   auto self_contig = self.contiguous();
@@ -1379,10 +1379,6 @@ Tensor index_select_quantized_cpu_(const Tensor & self, int64_t dim, const Tensor & index) {
   return at::native::index_select_out_cpu_(self, dim, index, result);
 }

-Tensor index_select_backward(const Tensor& grad, at::IntArrayRef self_sizes, int64_t dim, const Tensor& index) {
-  return at::native::index_select_backward_symint(grad, c10::fromIntArrayRefSlow(self_sizes), dim, index);
-}
-
 Tensor index_select_backward_symint(const Tensor& grad, c10::SymIntArrayRef self_sizes, int64_t dim, const Tensor& index) {
   // for composite compliance, use out-of-place variant of
   // `index_add` if index tensor is a Tensor Subclass.
@@ -1537,7 +1533,7 @@ static void scatter_reduce_exclude_self_helper(
   });
 }

-void _scatter_via_index_put(
+static void _scatter_via_index_put(
     const Tensor& self,
     int64_t dim,
     const Tensor& index,

aten/src/ATen/native/TensorConversions.cpp

+3 -3

@@ -1009,7 +1009,7 @@ Tensor dense_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize, c10::optional<int64_t> dense_dim_opt) {
   return dense_to_sparse_compressed<Layout::SparseBsc>(self, blocksize, dense_dim_opt);
 }

-void _check_blocksize_matches(
+static void _check_blocksize_matches(
     const Tensor& self,
     c10::optional<IntArrayRef> blocksize_opt,
     const std::string& name) {
@@ -1023,7 +1023,7 @@ void _check_blocksize_matches(
   }
 }

-Tensor sparse_compressed_clone(
+static Tensor sparse_compressed_clone(
     const Tensor& self,
     c10::optional<IntArrayRef> blocksize,
     const std::string& name) {
@@ -1046,7 +1046,7 @@ Tensor sparse_compressed_clone(
       values.device());
 }

-Tensor sparse_compressed_to_flipped(
+static Tensor sparse_compressed_to_flipped(
     const Tensor& self,
     c10::optional<IntArrayRef> blocksize,
     const std::string& name) {

aten/src/ATen/native/Unfold3d.cpp

+1

@@ -1,5 +1,6 @@
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/core/Tensor.h>
+#include <ATen/native/Unfold3d.h>
 #include <ATen/Config.h>
 #include <ATen/Dispatch.h>
 #include <ATen/Parallel.h>

aten/src/ATen/native/WeightNorm.cpp

+2

@@ -10,6 +10,8 @@
 #else
 #include <ATen/ops/_weight_norm_differentiable_backward_native.h>
 #include <ATen/ops/_weight_norm_interface.h>
+#include <ATen/ops/_weight_norm_interface_backward_native.h>
+#include <ATen/ops/_weight_norm_interface_native.h>
 #include <ATen/ops/_weight_norm_native.h>
 #include <ATen/ops/empty_strided.h>
 #include <ATen/ops/norm_except_dim.h>

aten/src/ATen/native/cpu/PowKernel.cpp

+2 -2

@@ -13,7 +13,7 @@ namespace at::native {

 inline namespace CPU_CAPABILITY {

-void pow_tensor_tensor_kernel(TensorIteratorBase& iter) {
+static void pow_tensor_tensor_kernel(TensorIteratorBase& iter) {
   const auto dtype = iter.common_dtype();
   if (isFloatingType(dtype) || isComplexType(dtype)) {
     AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, dtype, "pow", [&]() {
@@ -90,7 +90,7 @@ void reciprocal_kernel(TensorIteratorBase& iter);
 void rsqrt_kernel(TensorIteratorBase& iter);
 void sqrt_kernel(TensorIteratorBase& iter);

-void pow_tensor_scalar_kernel(
+static void pow_tensor_scalar_kernel(
     TensorIteratorBase& iter,
     const Scalar& exp_scalar) {
   // prevent multiple calls to iter.common_dtype()

aten/src/ATen/native/mkl/SparseBlasImpl.cpp

+1 -1

@@ -32,6 +32,7 @@ namespace mkl {

 namespace {

+#if AT_USE_MKL_SPARSE()
 c10::MaybeOwned<Tensor> prepare_dense_matrix_for_mkl(
     const Tensor& tensor) {
   if (tensor.is_non_overlapping_and_dense() ||
@@ -110,7 +111,6 @@ void inline col_indices_and_values_resize_(const Tensor& input, int64_t nnz) {
 /*
   Resizes `input` tensor and fills it with the data from MKL.
 */
-#if AT_USE_MKL_SPARSE()
 template <typename scalar_t>
 void mkl_result_copy_(const Tensor& input, sparse_matrix_t mkl_desc) {
   sparse_index_base_t indexing = SPARSE_INDEX_BASE_ZERO;

aten/src/ATen/native/nested/NestedTensorFactories.cpp

+1 -1

@@ -6,7 +6,7 @@
 namespace at {
 namespace native {

-TensorOptions verify_empty_parameters(
+static TensorOptions verify_empty_parameters(
     const at::Tensor& self,
     c10::optional<ScalarType> dtype,
     c10::optional<Layout> layout,

aten/src/ATen/native/nested/NestedTensorMatmul.cpp

-58

@@ -79,64 +79,6 @@ Tensor bmm_nested(const Tensor& self, const Tensor& mat2) {
   return output;
 }

-// utilities support `matmul_nested`
-namespace {
-// Args:
-//   self_sizes: the sizes of `self` in `matmul_nested`
-//   mat2_sizes: the sizes of `mat2` in `matmul_nested`
-//   buffer_op: the options for new buffer
-//   sizemat_op: the options for new size matrix
-// Returns:
-//   the batch size of each input underlying tensor, i.e. the product of batch-dimension sizes
-//   the empty output nested tensor
-inline std::tuple<std::vector<int64_t>, Tensor>
-matmul_nested_helper(
-    const std::vector<IntArrayRef>& self_sizes,
-    const std::vector<IntArrayRef>& mat2_sizes,
-    const c10::TensorOptions& buffer_op,
-    const c10::TensorOptions& sizemat_op) {
-  int64_t ntensors = self_sizes.size(),
-      ndims = self_sizes[0].size();
-  std::vector<int64_t> batch_sizes(ntensors, 1);
-  Tensor sizemat = at::empty({ntensors, ndims}, sizemat_op);
-  int64_t* sizemat_ptr = sizemat.mutable_data_ptr<int64_t>();
-  int64_t numel = 0;
-  for (int64_t i = 0; i < ntensors; i++) {
-    const IntArrayRef& self_size = self_sizes[i],
-        & mat2_size = mat2_sizes[i];
-    int64_t& batch_size = batch_sizes[i];
-    // batch dimensions
-    for (int64_t j = 0; j < ndims - 2; j++) {
-      const int64_t& self_sizej = self_size[j],
-          & mat2_sizej = mat2_size[j];
-      TORCH_CHECK(
-        self_sizej == mat2_sizej,
-        "matmul: For nested tensors, no broadcasting is currently performed: ",
-        i, "-th nested matrices in batch at dimension ", j + 1,
-        " have mismatching sizes ", self_sizej, " and ", mat2_sizej);
-      sizemat_ptr[j] = self_sizej;
-      batch_size *= sizemat_ptr[j];
-    }
-    // matrix multiplication dimensions
-    const int64_t& self_size0 = self_size[ndims - 2], & self_size1 = self_size[ndims - 1],
-        & mat2_size0 = mat2_size[ndims - 2], & mat2_size1 = mat2_size[ndims - 1];
-    TORCH_CHECK(
-      self_size1 == mat2_size0,
-      "matmul: ",
-      i, "-th nested matrices in batch cannot be multiplied (",
-      self_size0, "x", self_size1, " and ",
-      mat2_size0, "x", mat2_size1, ")");
-    sizemat_ptr[ndims - 2] = self_size0;
-    sizemat_ptr[ndims - 1] = mat2_size1;
-    sizemat_ptr += ndims;
-    numel += batch_size * self_size0 * mat2_size1;
-  }
-  Tensor buffer = at::empty(numel, buffer_op);
-  Tensor output = wrap_buffer(buffer, sizemat);
-  return std::make_tuple(batch_sizes, output);
-}
-}
-
 Tensor matmul_with_bmm_nested(const Tensor& self, const Tensor& mat2) {
   // Tensor self = self_.contiguous();
   // Tensor mat2 = mat2_.contiguous();

aten/src/ATen/native/quantized/FakeQuantPerChannelAffine.cpp

+1 -1

@@ -128,7 +128,7 @@ Tensor fake_quantize_per_channel_affine_cachemask_backward(
   return dY * mask;
 }

-Tensor _get_rounded_zero_point(
+static Tensor _get_rounded_zero_point(
     const Tensor& zero_point,
     int64_t quant_min,
     int64_t quant_max) {

aten/src/ATen/native/quantized/FakeQuantPerTensorAffine.cpp

+1 -1

@@ -133,7 +133,7 @@ Tensor fake_quantize_per_tensor_affine_cachemask_backward(
   return dY * mask;
 }

-int64_t _get_zero_point_from_tensor(
+static int64_t _get_zero_point_from_tensor(
     const Tensor& zero_point,
     int64_t quant_min,
     int64_t quant_max,

aten/src/ATen/native/quantized/QTensor.cpp

+1 -1

@@ -285,7 +285,7 @@ std::tuple<double, int64_t> _choose_qparams_per_tensor(
   return std::make_tuple(q_params.scale, q_params.zero_point);
 }

-float calculate_quant_loss(
+static float calculate_quant_loss(
     const float* input,
     int numel,
     float xmin,

0 commit comments
