@@ -1022,7 +1022,7 @@ Tensor multi_dot_impl(TensorList _tensors, c10::optional<Tensor> _out) {
// If the first and last tensors have shapes (a, b) and (b, c) the
// output has shape (a, c). If either the first or last tensor is 1D
- // a and/or c dimensions will be implicitely size 1 and will be ommited
+ // a and/or c dimensions will be implicitly size 1 and will be omitted
// from the output. e.g. for inputs (a, b) x (b) the output has shape (a,).
at::native::resize_output(out, out_shape);
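
For context, the shape rule this comment describes can be observed directly through the generated ATen wrapper for the op. A minimal sketch; the concrete tensor sizes are just made-up examples:

    #include <ATen/ATen.h>
    #include <iostream>

    int main() {
      auto A = at::randn({2, 3}); // (a, b) with a = 2, b = 3
      auto B = at::randn({3, 4}); // (b, c) with c = 4
      auto v = at::randn({4});    // 1-D last operand

      // (a, b) x (b, c) -> (a, c)
      std::cout << at::linalg_multi_dot({A, B}).sizes() << "\n";    // [2, 4]

      // With a 1-D last tensor the trailing dimension is implicitly 1
      // and is omitted from the output: (a, b) x (b, c) x (c) -> (a,)
      std::cout << at::linalg_multi_dot({A, B, v}).sizes() << "\n"; // [2]
      return 0;
    }
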
@@ -1809,7 +1809,7 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
* vs. other threads, leading to undefined behavior.
* Thus it is recommended to not use at::parallel_for where lambdas do
* ops that go through dispatcher.
- * For now we circument this by InferenceMode guard in order to unlock
+ * For now we circumvent this by InferenceMode guard in order to unlock
* performance.
* Longer term we probably want a separate API that explicitly calls out
* the TLS that it propagates.
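
To make the recommendation above concrete, here is a hypothetical helper (not the function from this file) showing the pattern the comment describes: each worker enters c10::InferenceMode before calling ops that go through the dispatcher, since the caller's thread-local dispatch state is not propagated into at::parallel_for worker threads.

    #include <ATen/ATen.h>
    #include <ATen/Parallel.h>
    #include <c10/core/InferenceMode.h>
    #include <vector>

    // Hypothetical helper: compute results[i] = lhs[i] @ rhs[i] in parallel,
    // with a per-thread InferenceMode guard around the dispatcher calls.
    void batched_matmul(std::vector<at::Tensor>& results,
                        const std::vector<at::Tensor>& lhs,
                        const std::vector<at::Tensor>& rhs) {
      at::parallel_for(0, static_cast<int64_t>(lhs.size()), /*grain_size=*/1,
                       [&](int64_t begin, int64_t end) {
        c10::InferenceMode guard;  // per-thread guard, as in the comment above
        for (int64_t i = begin; i < end; ++i) {
          results[i] = at::matmul(lhs[i], rhs[i]);  // goes through the dispatcher
        }
      });
    }
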
@@ -1946,7 +1946,7 @@ static bool should_fold(const Tensor& tensor1, const Tensor& tensor2, bool has_o
// The output gradient g of this operation would have shape [b, m, k]
// The backward wrt. t2 of bmm would be given by t1.mH @ g, which has shape [b, n, k]
// Then, the backward of expand is simply `sum(0)`. As such, we are instantiating a tensor
- // of shape [b, n, k] unnacessarily, which may cause a large memory footprint, and in the
+ // of shape [b, n, k] unnecessarily, which may cause a large memory footprint, and in the
// worst case, an OOM
bool t2_requires_grad = tensor1_larger ? tensor2.requires_grad() : tensor1.requires_grad();
if (t2_requires_grad && !has_out) {
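
As a shape-level illustration of the argument in this comment (a standalone sketch with made-up sizes, not the kernel's own code): with t1 of shape [b, m, n] and a 2-D t2 of shape [n, k] expanded to [b, n, k], the bmm output and its gradient g have shape [b, m, k], and the intermediate t1.mH @ g that bmm's backward would materialize has shape [b, n, k] before the expand backward reduces it with sum(0).

    #include <ATen/ATen.h>
    #include <iostream>

    int main() {
      constexpr int64_t b = 8, m = 4, n = 5, k = 6;
      auto t1 = at::randn({b, m, n});
      auto t2 = at::randn({n, k});

      // Broadcasted batched matmul: expand t2 and run bmm.
      auto t2_expanded = t2.expand({b, n, k});
      auto out = at::bmm(t1, t2_expanded);               // [b, m, k]
      auto g = at::ones_like(out);                       // stand-in for the output gradient

      // What bmm's backward wrt. the expanded t2 would materialize
      // (t1 is real here, so the transpose plays the role of t1.mH):
      auto grad_t2_expanded = at::bmm(t1.transpose(1, 2), g); // [b, n, k], the "unnecessary" tensor
      auto grad_t2 = grad_t2_expanded.sum(0);                 // [n, k], after the expand backward

      std::cout << out.sizes() << " " << grad_t2_expanded.sizes() << " "
                << grad_t2.sizes() << "\n";              // [8, 4, 6] [8, 5, 6] [5, 6]
      return 0;
    }
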
@@ -2602,7 +2602,7 @@ Tensor compute_T18_scale_square(
auto scs = section_values.template data_ptr<int64_t>();
auto pts = &scs[section_numel];

- // We now will do the matrix muplication in a batch, with above example:
+ // We now will do the matrix multiplication in a batch, with above example:
// 1. Multiply all matrices by 0 (`mul_times[0]`) times, then do `slice`
// to get the remain matrices by acc[1:] (`split_counts[0]`),
// 2. Multiply remain matrices by 1 times and slice to acc[2:]
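
The batching scheme this comment sketches can be illustrated in isolation roughly as follows. This is a hypothetical standalone sketch, not the kernel itself: it assumes a 3-D batch of square matrices already sorted by how many squarings each matrix still needs, and it repeatedly squares an ever-shrinking suffix of the batch in place, slicing off the matrices that are finished.

    #include <ATen/ATen.h>
    #include <vector>

    // 'needed[i]' = number of squarings matrix acc[i] still needs, sorted ascending.
    // acc is modified in place and also returned.
    at::Tensor square_in_batches(at::Tensor acc, const std::vector<int64_t>& needed) {
      int64_t done = 0;   // squarings applied so far to the current suffix
      int64_t start = 0;  // index of the first matrix that still needs squaring
      const auto n = static_cast<int64_t>(needed.size());
      while (start < n) {
        // Drop matrices that are already finished ("slice" to acc[start:]).
        while (start < n && needed[start] == done) {
          ++start;
        }
        if (start == n) break;
        // One batched squaring round over the remaining suffix.
        auto remaining = acc.slice(/*dim=*/0, start);
        acc.slice(0, start).copy_(at::bmm(remaining, remaining));
        ++done;
      }
      return acc;
    }

With needed = {0, 1, 1, 3}, matrix 0 is left untouched, matrices 1 and 2 are squared once, and matrix 3 is squared three times, which is the "multiply in a batch, then slice to the still-unfinished suffix" idea the comment describes.
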
@@ -2761,7 +2761,7 @@ Tensor backward_analytic_function_of_a_matrix(
} // end anon namespace

// Computes the matrix exponential for a given batch of squared matrices.
- // The implementaion is based on:
+ // The implementation is based on:
//
// Bader, P.; Blanes, S.; Casas, F.
// Computing the Matrix Exponential with an Optimized Taylor Polynomial Approximation.
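
As a quick illustration of what this entry point computes, a minimal sketch using the generated at::linalg_matrix_exp wrapper; the diagonal test matrix is just a case where the answer is known in closed form:

    #include <ATen/ATen.h>
    #include <iostream>

    int main() {
      // For a diagonal matrix, the matrix exponential is the elementwise
      // exponential of the diagonal, which gives an easy correctness check.
      auto d = at::randn({4});
      auto A = at::diag(d);
      auto expA = at::linalg_matrix_exp(A);
      std::cout << at::allclose(expA, at::diag(at::exp(d))) << "\n";            // 1

      // It also accepts a batch of square matrices.
      std::cout << at::linalg_matrix_exp(at::randn({3, 5, 5})).sizes() << "\n"; // [3, 5, 5]
      return 0;
    }
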
@@ -2812,7 +2812,7 @@ TORCH_IMPL_FUNC(linalg_vector_norm_out)(const Tensor& self, const Scalar& scalar
// Reductions always use `std::abs` to compute the absolute value. In the backward of this
// function, we need to locate the index that was selected as the largest value. To do so
// we do self.abs() == result to locate the index of the largest element.
- // Now, self.abs() may dispatch to a vectorized implementation which gives sliiightly different
+ // Now, self.abs() may dispatch to a vectorized implementation which gives slightly different
// results to the std::abs(std::complex<T>) implementation.
// As such, to be able to compute the correct index in the backward, we need to use self.abs()
// both in the forward and in the backward
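
From the outside, the index-locating trick this comment refers to looks roughly like the following minimal sketch (the ord=inf case, where the reduction picks a single largest element, is the relevant one; the tensor here is just an example):

    #include <ATen/ATen.h>
    #include <iostream>
    #include <limits>

    int main() {
      // The inf-norm is a max reduction over absolute values; its backward
      // needs to know which element attained the maximum.
      auto x = at::randn({5}, at::kComplexFloat);
      auto inf = std::numeric_limits<double>::infinity();
      auto result = at::linalg_vector_norm(x, inf);

      // The mask marks the selected element. Per the comment above, this
      // comparison is only reliable if the forward computed the absolute
      // values through the same self.abs() path as the backward does.
      auto mask = x.abs() == result;
      std::cout << mask << "\n";
      return 0;
    }
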