Skip to content

Commit c92de3b

Browse files
CaoE authored and pytorchmergebot committed
Add BRGEMM API versioning to be compatible with different oneDNN versions (pytorch#138184)
oneDNN v3.6 updated the ukernel APIs of `brgemm` and `brgemm_pack_B`. Considering the upgrade of oneDNN, ukernel API versioning is needed to be compatible with different oneDNN versions. Pull Request resolved: pytorch#138184 Approved by: https://github.com/jgong5, https://github.com/peterbell10
1 parent 299dbcd commit c92de3b

File tree

3 files changed

+71
-46
lines changed

3 files changed

+71
-46
lines changed

aten/src/ATen/native/CPUBlas.cpp

+64-36
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,21 @@ extern "C" void zaxpy_(int *n, void *a, const void *x, int *incx, void *y, int *
4545
#endif // USE_FBGEMM
4646

4747
#if AT_MKLDNN_ENABLED()
48-
#include <oneapi/dnnl/dnnl_version.h>
49-
#endif // oneDNN
50-
51-
#define ONEDNN_UKERNEL_ENABLED (DNNL_VERSION_MAJOR >=3 && DNNL_VERSION_MINOR >=5)
48+
#include <ideep.hpp>
49+
// Add uKernel API versioning to be compatible with different oneDNN versions
50+
// oneDNN 3.6.x updates the ukernel APIs of brgemm and brgemm_pack_B
51+
// brgemm_pack_B is changed to transform and the setting of brgemm beta is changed to set_add_C
52+
#if (IDEEP_VERSION_MAJOR == 3 && IDEEP_VERSION_MINOR == 5)
53+
#define ONEDNN_UKERNEL_1
54+
#elif (IDEEP_VERSION_MAJOR >= 3 && IDEEP_VERSION_MINOR >= 6)
55+
#define ONEDNN_UKERNEL_2
56+
#endif
57+
#if ((defined(ONEDNN_UKERNEL_1) || defined(ONEDNN_UKERNEL_2)) && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC))))
58+
#define ONEDNN_UKERNEL_ENABLED
59+
#endif
60+
#endif // AT_MKLDNN_ENABLED()
5261

53-
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
62+
#if defined(ONEDNN_UKERNEL_ENABLED)
5463
#include <oneapi/dnnl/dnnl_ukernel.hpp>
5564
#include <oneapi/dnnl/dnnl.hpp>
5665
#endif // oneDNN BRGEMM
@@ -847,7 +856,7 @@ void copy(int64_t n, const c10::complex<float> *x, int64_t incx, c10::complex<fl
847856
}
848857

849858
// oneDNN BRGEMM
850-
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
859+
#if defined(ONEDNN_UKERNEL_ENABLED)
851860
struct BrgemmKey {
852861
int64_t M;
853862
int64_t N;
@@ -859,8 +868,8 @@ struct BrgemmKey {
859868
ScalarType dt_a;
860869
ScalarType dt_b;
861870
ScalarType dt_c;
862-
float alpha;
863-
float beta;
871+
bool add_C;
872+
864873
BrgemmKey(
865874
int64_t M,
866875
int64_t N,
@@ -872,8 +881,7 @@ struct BrgemmKey {
872881
ScalarType dt_a,
873882
ScalarType dt_b,
874883
ScalarType dt_c,
875-
float alpha,
876-
float beta)
884+
bool add_C)
877885
: M(M),
878886
N(N),
879887
K(K),
@@ -884,14 +892,12 @@ struct BrgemmKey {
884892
dt_a(dt_a),
885893
dt_b(dt_b),
886894
dt_c(dt_c),
887-
alpha(alpha),
888-
beta(beta) {}
895+
add_C(add_C) {}
889896
bool operator==(const BrgemmKey& other) const {
890897
return M == other.M && N == other.N && K == other.K &&
891898
batch_size == other.batch_size && lda == other.lda &&
892899
ldb == other.ldb && ldc == other.ldc && dt_a == other.dt_a &&
893-
dt_b == other.dt_b && dt_c == other.dt_c && alpha == other.alpha &&
894-
beta == other.beta;
900+
dt_b == other.dt_b && dt_c == other.dt_c && add_C == other.add_C;
895901
}
896902
};
897903

@@ -945,13 +951,13 @@ struct UnsafeUkernelKeyHasher {
945951

946952
template<>
947953
std::size_t UnsafeUkernelKeyHasher<BrgemmKey>::operator()(const BrgemmKey& key) const {
948-
// Use beta, M, N, and K to compute hash to reduce the overhead as
949-
// batch size, alpha, and data types are unlikely to change within the same kernel and
950-
// leading dimensions are likely to be related to M, K, N or use fixed values.
951-
std::size_t h = std::hash<float>()(key.beta + 1);
952-
h = std::hash<int64_t>()(key.M) ^ (h << 1);
954+
// Use M, N, K add_C, and ldc to compute hash to reduce the overhead as
955+
// batch size and data types are unlikely to change within the same kernel and
956+
// lda/ldb are likely to be related to M, K, N or use fixed values.
957+
std::size_t h = std::hash<int64_t>()(key.M);
953958
h = std::hash<int64_t>()(key.N) ^ (h << 1);
954959
h = std::hash<int64_t>()(key.K) ^ (h << 1);
960+
h = std::hash<bool>()(key.add_C) ^ (h << 1);
955961
h = std::hash<int64_t>()(key.ldc) ^ (h << 1);
956962
return h;
957963
}
@@ -1000,9 +1006,9 @@ struct GemmHelper {
10001006
ScalarType dt_a,
10011007
ScalarType dt_b,
10021008
ScalarType dt_c,
1003-
const float alpha,
1004-
const float beta) {
1009+
const bool add_C) {
10051010
// Create brgemm
1011+
#if defined(ONEDNN_UKERNEL_1)
10061012
brg = dnnl::ukernel::brgemm(
10071013
M,
10081014
N,
@@ -1014,8 +1020,23 @@ struct GemmHelper {
10141020
get_dnnl_dtype(dt_a),
10151021
get_dnnl_dtype(dt_b),
10161022
get_dnnl_dtype(dt_c),
1017-
alpha,
1018-
beta);
1023+
1,
1024+
add_C ? 1 : 0);
1025+
#elif defined(ONEDNN_UKERNEL_2)
1026+
brg = dnnl::ukernel::brgemm(
1027+
M,
1028+
N,
1029+
K,
1030+
bs,
1031+
ld_a,
1032+
ld_b,
1033+
ld_c,
1034+
get_dnnl_dtype(dt_a),
1035+
get_dnnl_dtype(dt_b),
1036+
get_dnnl_dtype(dt_c));
1037+
brg.set_add_C(add_C);
1038+
brg.finalize();
1039+
#endif
10191040
// Create a scratchpad buffer for the brgemm execution
10201041
scratchpad = std::vector<uint8_t>(brg.get_scratchpad_size());
10211042
// Prepare default vector of pairs of tensors A and B offsets for each batch.
@@ -1037,8 +1058,7 @@ struct Brgemm : public KernelCache <BrgemmKey, GemmHelper> {
10371058
int64_t ld_a,
10381059
int64_t ld_b,
10391060
int64_t ld_c,
1040-
const float alpha,
1041-
const float beta,
1061+
const bool add_C,
10421062
const scalar_t_a* A,
10431063
const scalar_t_b* B,
10441064
scalar_t_c* C) {
@@ -1053,8 +1073,7 @@ struct Brgemm : public KernelCache <BrgemmKey, GemmHelper> {
10531073
c10::CppTypeToScalarType<scalar_t_a>::value,
10541074
c10::CppTypeToScalarType<scalar_t_b>::value,
10551075
c10::CppTypeToScalarType<scalar_t_c>::value,
1056-
alpha,
1057-
beta);
1076+
add_C);
10581077
// Fetch/create GemmHelper object
10591078
auto&& value = fetch_or_create(key, [&]() {
10601079
auto&& v = std::make_shared<GemmHelper>(
@@ -1068,13 +1087,14 @@ struct Brgemm : public KernelCache <BrgemmKey, GemmHelper> {
10681087
c10::CppTypeToScalarType<scalar_t_a>::value,
10691088
c10::CppTypeToScalarType<scalar_t_b>::value,
10701089
c10::CppTypeToScalarType<scalar_t_c>::value,
1071-
alpha,
1072-
beta);
1090+
add_C);
10731091
(*v).brg.generate();
10741092
return std::move(v);
10751093
});
10761094
if (get_current() != value) {
1095+
#if defined(ONEDNN_UKERNEL_1)
10771096
dnnl::ukernel::brgemm::release_hw_context();
1097+
#endif
10781098
((*value).brg).set_hw_context();
10791099
get_current() = value;
10801100
}
@@ -1099,7 +1119,11 @@ struct Brgemm : public KernelCache <BrgemmKey, GemmHelper> {
10991119
}
11001120
};
11011121

1122+
#if defined(ONEDNN_UKERNEL_1)
11021123
using pack_t = dnnl::ukernel::brgemm_pack_B;
1124+
#elif defined(ONEDNN_UKERNEL_2)
1125+
using pack_t = dnnl::ukernel::transform;
1126+
#endif
11031127
struct Pack : public KernelCache <PackKey, pack_t> {
11041128
static inline void call(
11051129
int64_t K,
@@ -1113,7 +1137,11 @@ struct Pack : public KernelCache <PackKey, pack_t> {
11131137
auto&& key = PackKey(K, N, ld_in, ld_out, dt_in, dt_out);
11141138
auto&& pack = fetch_or_create(key, [&]() {
11151139
auto&& p = std::make_shared<pack_t>(
1140+
#if defined(ONEDNN_UKERNEL_1)
11161141
K, N, ld_in, ld_out, get_dnnl_dtype(dt_in), get_dnnl_dtype(dt_out));
1142+
#elif defined(ONEDNN_UKERNEL_2)
1143+
K, N, dnnl::ukernel::pack_type::no_trans, ld_in, ld_out, get_dnnl_dtype(dt_in), get_dnnl_dtype(dt_out));
1144+
#endif
11171145
if (need_pack(dt_in)) {
11181146
(*p).generate();
11191147
}
@@ -1146,15 +1174,14 @@ void brgemm(
11461174
int64_t ld_a,
11471175
int64_t ld_b,
11481176
int64_t ld_c,
1149-
const float alpha,
1150-
const float beta,
1177+
const bool add_C,
11511178
const at::Half* A,
11521179
const at::Half* B,
11531180
float* C) {
1154-
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
1181+
#if defined(ONEDNN_UKERNEL_ENABLED)
11551182
if (Brgemm::device_check(ScalarType::Half)) {
11561183
Brgemm::call<at::Half, at::Half, float>(
1157-
M, N, K, ld_a, ld_b, ld_c, alpha, beta, A, B, C);
1184+
M, N, K, ld_a, ld_b, ld_c, add_C, A, B, C);
11581185
return;
11591186
}
11601187
#endif
@@ -1163,8 +1190,9 @@ void brgemm(
11631190
}
11641191

11651192
void brgemm_release() {
1166-
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
1193+
#if defined(ONEDNN_UKERNEL_ENABLED)
11671194
dnnl::ukernel::brgemm::release_hw_context();
1195+
Brgemm::get_current() = nullptr;
11681196
#endif
11691197
}
11701198

@@ -1177,15 +1205,15 @@ void pack(
11771205
ScalarType dt_out,
11781206
const void* in,
11791207
void* out) {
1180-
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
1208+
#if defined(ONEDNN_UKERNEL_ENABLED)
11811209
Pack::call(K, N, ld_in, ld_out, dt_in, dt_out, in, out);
11821210
#else
11831211
TORCH_CHECK(false, "pack is only supported on X64 with oneDNN ukernel enabled");
11841212
#endif
11851213
}
11861214

11871215
bool need_pack(ScalarType dt_in) {
1188-
#if ONEDNN_UKERNEL_ENABLED && (defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)))
1216+
#if defined(ONEDNN_UKERNEL_ENABLED)
11891217
return Pack::need_pack(dt_in);
11901218
#else
11911219
return false;

aten/src/ATen/native/CPUBlas.h

+2-3
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ void copy(int64_t n, const c10::complex<float> *x, int64_t incx, c10::complex<fl
189189

190190
// Batch-reduce GEMM
191191
// Operates by the following formula:
192-
// C = alpha * SUM(A[i] x B[i]) + beta * C, i = 0 to batch size
192+
// C = SUM(A[i] x B[i]) + C if add_C is true, i = 0 to batch size
193193
// A Base pointer to a tensor A.
194194
// B Base pointer to a tensor B.
195195
// C Pointer to a tensor C (accumulation buffer).
@@ -200,8 +200,7 @@ TORCH_API void brgemm(
200200
int64_t ld_a,
201201
int64_t ld_b,
202202
int64_t ld_c,
203-
const float alpha,
204-
const float beta,
203+
const bool add_C,
205204
const at::Half* A,
206205
const at::Half* B,
207206
float* C);

aten/src/ATen/native/cpu/FlashAttentionKernel.cpp

+5-7
Original file line numberDiff line numberDiff line change
@@ -603,8 +603,7 @@ void cpu_flash_attention(
603603
headSize_even ? qStrideM : eheadSize,
604604
packb_size,
605605
rkvBlockSize,
606-
1.f,
607-
0.f,
606+
false,
608607
!headSize_even
609608
? query_t_padding_ptr
610609
: q_data + i * qStrideB + j * qStrideH + m * qStrideM,
@@ -738,8 +737,7 @@ void cpu_flash_attention(
738737
ekvBlockSize,
739738
packb_size,
740739
rHeadSize,
741-
1.0,
742-
n == 0 ? 0.f : 1.f,
740+
n > 0,
743741
qk_reduced_data,
744742
value_reorder_ptr +
745743
i * num_head * kv_padding_size * rHeadSize +
@@ -791,10 +789,10 @@ void cpu_flash_attention(
791789
// Move to the next query
792790
data_index_step(i, batchSize, j, num_head, k, qSlice);
793791
}
792+
if (need_pack) {
793+
cpublas::brgemm_release();
794+
}
794795
});
795-
if (need_pack) {
796-
cpublas::brgemm_release();
797-
}
798796
}
799797

800798
template <typename scalar_t, typename mask_t, int64_t q_split_size, int64_t kv_split_size>

0 commit comments

Comments (0)