
Commit 29516bd

CaoE authored and pytorchmergebot committed
add _amp_foreach_non_finite_check_and_unscale_cpu_ and _amp_update_scale_cpu_ kernels on CPU (pytorch#109281)
Step 1 of pytorch#111559.
Pull Request resolved: pytorch#109281
Approved by: https://github.com/jgong5, https://github.com/ezyang
1 parent 0fa6ee4 commit 29516bd

17 files changed: +457 −22 lines
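
The two new native functions route AMP's gradient unscaling and loss-scale update through CPU dispatch stubs. Below is a minimal sketch of how they could be exercised end to end on CPU tensors; the at::_amp_* binding names are assumed from the op names in this commit and are not shown in this excerpt.

// Hedged sketch, not part of this diff: drive the new CPU AMP ops directly.
#include <ATen/ATen.h>

int main() {
  auto grad = at::randn({8}) * 128.0;            // a gradient scaled by 128
  auto found_inf = at::zeros({1});               // set to 1.0 if any inf/nan is found
  auto inv_scale = at::full({1}, 1.0f / 128.0f); // reciprocal of the loss scale

  // Unscale in place and record whether any element was non-finite.
  at::_amp_foreach_non_finite_check_and_unscale_({grad}, found_inf, inv_scale);

  // Update the loss scale: back off on inf/nan, grow after enough clean steps.
  auto scale = at::full({1}, 128.0f);
  auto growth_tracker = at::zeros({1}, at::kInt);
  at::_amp_update_scale_(scale, growth_tracker, found_inf,
                         /*growth_factor=*/2.0, /*backoff_factor=*/0.5,
                         /*growth_interval=*/2000);
  return 0;
}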

aten/src/ATen/cpu/vec/vec256/vec256_double.h (+4)

@@ -100,6 +100,10 @@ template <> class Vectorized<double> {
   Vectorized<double> isnan() const {
     return _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_UNORD_Q);
   }
+  bool has_inf_nan() const {
+    __m256d self_sub = _mm256_sub_pd(values, values);
+    return (_mm256_movemask_epi8(_mm256_castpd_si256(self_sub)) & 0x77777777) != 0;
+  }
   Vectorized<double> map(double (*const f)(double)) const {
     __at_align__ double tmp[size()];
     store(tmp);
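
The AVX2 (and AVX-512) has_inf_nan variants rely on the identity that x - x is ±0 for every finite x but NaN for ±inf and NaN inputs, so any set payload bit in values - values flags a non-finite lane; the 0x77777777 mask drops each lane's sign bit so a negative-zero difference is never counted. A scalar illustration of the same idea, as a hedged sketch that is not part of this diff:

#include <cmath>
#include <cstdio>

// Scalar version of the trick used by the SIMD has_inf_nan() above:
// for finite x, x - x == +/-0; for +/-inf or NaN, x - x == NaN.
static bool scalar_has_inf_nan(double x) {
  double self_sub = x - x;
  return std::isnan(self_sub);
}

int main() {
  std::printf("%d %d %d %d\n",
              scalar_has_inf_nan(1.5),            // 0
              scalar_has_inf_nan(0.0),            // 0
              scalar_has_inf_nan(INFINITY),       // 1
              scalar_has_inf_nan(std::nan("")));  // 1
}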

aten/src/ATen/cpu/vec/vec256/vec256_float.h (+6)

@@ -106,6 +106,12 @@ template <> class Vectorized<float> {
   Vectorized<float> isnan() const {
     return _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_UNORD_Q);
   }
+
+  bool has_inf_nan() const {
+    __m256 self_sub = _mm256_sub_ps(values, values);
+    return (_mm256_movemask_epi8(_mm256_castps_si256(self_sub)) & 0x77777777) != 0;
+  }
+
   Vectorized<float> map(float (*const f)(float)) const {
     __at_align__ float tmp[size()];
     store(tmp);

aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h (+10)

@@ -307,6 +307,16 @@ template <> class Vectorized<float> {
     }
     return loadu(res);
   };
+  bool has_inf_nan() const {
+    __at_align__ float tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      if(_isnan(tmp[i]) || _isinf(tmp[i])) {
+        return true;
+      }
+    }
+    return false;
+  }
   Vectorized<float> map(float (*const f)(float)) const {
     __at_align__ float tmp[size()];
     store(tmp);

aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h (+13)

@@ -383,6 +383,19 @@ class Vectorized<double> {
     auto ret = (x == x);
     return ret._nor();
   }
+  bool has_inf_nan() const {
+    for (const auto i : c10::irange(size()/2)) {
+      if(_isnan(_vec0[i]) || _isinf(_vec0[i])) {
+        return true;
+      }
+    }
+    for (const auto i : c10::irange(size()/2)) {
+      if(_isnan(_vec1[i]) || _isinf(_vec1[i])) {
+        return true;
+      }
+    }
+    return false;
+  }

   DEFINE_MEMBER_OP(operator==, double, vec_cmpeq)
   DEFINE_MEMBER_OP(operator!=, double, vec_cmpne)

aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h (+14)

@@ -239,6 +239,20 @@ class Vectorized<float> {
     return (x == v_inf) | (x == v_minus_inf);
   }

+  bool has_inf_nan() const {
+    for (const auto i : c10::irange(size()/2)) {
+      if(_isnan(_vec0[i]) || _isinf(_vec0[i])) {
+        return true;
+      }
+    }
+    for (const auto i : c10::irange(size()/2)) {
+      if(_isnan(_vec1[i]) || _isinf(_vec1[i])) {
+        return true;
+      }
+    }
+    return false;
+  }
+
   int zero_mask() const {
     // returns an integer mask where all zero elements are translated to 1-bit
     // and others are translated to 0-bit

aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h (+14)

@@ -875,6 +875,20 @@ struct Vectorized<T, std::enable_if_t<is_zarch_implemented<T>()>> {
     return ret._not();
   }

+  bool has_inf_nan() const {
+    for (const auto i : c10::irange(size()/2)) {
+      if(_isnan(_vec0[i]) || _isinf(_vec0[i])) {
+        return true;
+      }
+    }
+    for (const auto i : c10::irange(size()/2)) {
+      if(_isnan(_vec1[i]) || _isinf(_vec1[i])) {
+        return true;
+      }
+    }
+    return false;
+  }
+
   template <
       typename U = T,
       std::enable_if_t<std::is_floating_point<U>::value, int> = 0>

aten/src/ATen/cpu/vec/vec512/vec512_double.h (+4)

@@ -106,6 +106,10 @@ template <> class Vectorized<double> {
     return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask,
                                                       0xFFFFFFFFFFFFFFFF));
   }
+  bool has_inf_nan() const {
+    __m512d self_sub = _mm512_sub_pd(values, values);
+    return (_mm512_movepi8_mask(_mm512_castpd_si512(self_sub)) & 0x7777777777777777) != 0;
+  }
   Vectorized<double> map(double (*const f)(double)) const {
     __at_align__ double tmp[size()];
     store(tmp);

aten/src/ATen/cpu/vec/vec512/vec512_float.h (+4)

@@ -125,6 +125,10 @@ template <> class Vectorized<float> {
     return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask,
                                                       0xFFFFFFFF));
  }
+  bool has_inf_nan() const {
+    __m512 self_sub = _mm512_sub_ps(values, values);
+    return (_mm512_movepi8_mask(_mm512_castps_si512(self_sub)) & 0x7777777777777777) != 0;
+  }
   Vectorized<float> map(float (*const f)(float)) const {
     __at_align__ float tmp[size()];
     store(tmp);

aten/src/ATen/cpu/vec/vec_base.h (+8)

@@ -255,6 +255,14 @@ struct Vectorized {
     }
     return vector;
   }
+  bool has_inf_nan() const {
+    for (int64_t i = 0; i != size(); i++) {
+      if(_isnan(values[i]) || _isinf(values[i])) {
+        return true;
+      }
+    }
+    return false;
+  }
   Vectorized<T> map(T (*const f)(T)) const {
     Vectorized<T> ret;
     for (int64_t i = 0; i != size(); i++) {

aten/src/ATen/native/AmpKernels.cpp (+41)

@@ -0,0 +1,41 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/native/AmpKernels.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/Tensor.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_amp_foreach_non_finite_check_and_unscale.h>
+#include <ATen/ops/_amp_foreach_non_finite_check_and_unscale_native.h>
+#include <ATen/ops/_amp_update_scale.h>
+#include <ATen/ops/_amp_update_scale_native.h>
+#endif
+
+namespace at::native {
+
+void _amp_foreach_non_finite_check_and_unscale_cpu_(
+    TensorList scaled_grads,
+    at::Tensor& found_inf,
+    const at::Tensor& inv_scale) {
+  _amp_foreach_non_finite_check_and_unscale_cpu_stub(
+      found_inf.device().type(), scaled_grads, found_inf, inv_scale);
+}
+
+at::Tensor& _amp_update_scale_cpu_ (
+    at::Tensor& current_scale,
+    at::Tensor& growth_tracker,
+    const at::Tensor& found_inf,
+    double growth_factor,
+    double backoff_factor,
+    int64_t growth_interval) {
+  return _amp_update_scale_cpu_stub(
+      growth_tracker.device().type(), current_scale, growth_tracker,
+      found_inf, growth_factor, backoff_factor, growth_interval);
+}
+
+DEFINE_DISPATCH(_amp_foreach_non_finite_check_and_unscale_cpu_stub);
+DEFINE_DISPATCH(_amp_update_scale_cpu_stub);
+
+} // namespace at::native
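
The kernels registered for these stubs live in a separate file of this commit (not shown in this excerpt) and are what actually walk the gradients. A minimal sketch of the check-and-unscale inner loop such a kernel performs, using the Vectorized<T>::has_inf_nan() helper added above; this is an illustration with a made-up helper name, not the committed implementation.

#include <ATen/cpu/vec/vec.h>
#include <cmath>
#include <cstdint>

// Hedged sketch: vectorized non-finite check + in-place unscale over one
// contiguous float buffer. Real kernels also handle dtype dispatch, tensor
// iteration, and writing 1.0 into found_inf.
static bool unscale_and_check(float* grad, int64_t n, float inv_scale) {
  using Vec = at::vec::Vectorized<float>;
  bool found_non_finite = false;
  int64_t i = 0;
  for (; i + Vec::size() <= n; i += Vec::size()) {
    Vec v = Vec::loadu(grad + i);
    found_non_finite |= v.has_inf_nan();   // helper added by this commit
    (v * Vec(inv_scale)).store(grad + i);  // unscale in place
  }
  for (; i < n; ++i) {                     // scalar tail
    found_non_finite |= std::isnan(grad[i]) || std::isinf(grad[i]);
    grad[i] *= inv_scale;
  }
  return found_non_finite;
}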

aten/src/ATen/native/AmpKernels.h (+28)

@@ -0,0 +1,28 @@
+#pragma once
+
+#include <ATen/native/DispatchStub.h>
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+class Tensor;
+
+namespace native {
+
+using _amp_foreach_non_finite_check_and_unscale_cpu__fn = void (*)(
+    TensorList,
+    Tensor&,
+    const Tensor&);
+
+using _amp_update_scale_cpu__fn = Tensor& (*)(
+    Tensor&,
+    Tensor&,
+    const Tensor&,
+    double,
+    double,
+    int64_t);
+
+DECLARE_DISPATCH(_amp_foreach_non_finite_check_and_unscale_cpu__fn, _amp_foreach_non_finite_check_and_unscale_cpu_stub);
+DECLARE_DISPATCH(_amp_update_scale_cpu__fn, _amp_update_scale_cpu_stub);
+
+} // namespace native
+} // namespace at
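
For completeness, stubs declared with DECLARE_DISPATCH are bound to concrete kernels via REGISTER_DISPATCH in an implementation translation unit (one of the changed files not shown in this excerpt); a hedged sketch with illustrative kernel names:

// In a native/cpu/ translation unit (kernel function names are illustrative):
REGISTER_DISPATCH(_amp_foreach_non_finite_check_and_unscale_cpu_stub,
                  &_amp_foreach_non_finite_check_and_unscale_cpu_kernel);
REGISTER_DISPATCH(_amp_update_scale_cpu_stub, &_amp_update_scale_cpu_kernel);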
