
Commit e3aefe2

jeffdaily authored and pytorchmergebot committed
Revert "Initial Flash Attention support on ROCM (pytorch#114309)" (pytorch#115975)
This reverts commit 5bddbed. Pull Request resolved: pytorch#115975 Approved by: https://github.com/atalman, https://github.com/malfet
1 parent 8283491 commit e3aefe2

Showing 14 changed files with 23 additions and 848 deletions.


CMakeLists.txt (+1, -12)

@@ -735,21 +735,10 @@ endif()
 include(cmake/Dependencies.cmake)
 
 # Moved this cmake set option down here because CMAKE_CUDA_COMPILER_VERSION is not avaialble until now
-# TODO: Merge this into cmake_dependent_option as "NOT MSVC AND (USE_CUDA OR USE_ROCM)"
-# once cmake_minimum_required is bumped to 3.22
-# See https://cmake.org/cmake/help/latest/policy/CMP0127.html for the feature required here.
-if(MSVC)
-  set(CONFIG_FA OFF)
-elseif(USE_ROCM OR USE_CUDA)
-  set(CONFIG_FA ON)
-else()
-  set(CONFIG_FA OFF)
-endif()
-
 cmake_dependent_option(
     USE_FLASH_ATTENTION
     "Whether to build the flash_attention kernel for scaled dot product attention" ON
-    "CONFIG_FA" OFF)
+    "USE_CUDA AND NOT ROCM AND NOT MSVC AND NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.6" OFF)
 
 # Flash Attention2 will error while building for sm52 while Mem Eff Attention won't
 cmake_dependent_option(

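For context on the restored gate: cmake_dependent_option() only exposes USE_FLASH_ATTENTION when the quoted dependency condition holds, and forces it to OFF otherwise. Below is a minimal, self-contained sketch of that semantics; the demo project name and the stand-in variable values are hypothetical and not part of this change.

cmake_minimum_required(VERSION 3.22)   # 3.22 enables full condition syntax in the dependency string (CMP0127)
project(dep_option_demo NONE)
include(CMakeDependentOption)

set(USE_CUDA ON)                       # stand-in for the real configure flag
set(CMAKE_CUDA_COMPILER_VERSION 12.1)  # faked for the demo; normally set by enable_language(CUDA)

# DEMO_FLASH defaults to ON only while the condition is true; otherwise it is forced OFF,
# which is how USE_FLASH_ATTENTION is gated after this revert.
cmake_dependent_option(DEMO_FLASH "demo of the USE_FLASH_ATTENTION gate" ON
    "USE_CUDA AND NOT MSVC AND NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.6" OFF)

message(STATUS "DEMO_FLASH=${DEMO_FLASH}")   # prints ON with the values above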
aten/src/ATen/CMakeLists.txt (+3, -34)

@@ -164,10 +164,6 @@ file(GLOB flash_attention_cuda_cu "native/transformers/cuda/flash_attn/*.cu")
 file(GLOB flash_attention_cuda_kernels_cu "native/transformers/cuda/flash_attn/kernels/*.cu")
 file(GLOB flash_attention_cuda_cpp "native/transformers/cuda/flash_attn/*.cpp")
 
-# flash_attention sources
-file(GLOB flash_attention_hip_hip "native/transformers/hip/flash_attn/*.hip")
-file(GLOB flash_attention_src_hip_hip "native/transformers/hip/flash_attn/src/*.hip")
-
 #Mem_eff attention sources
 file(GLOB mem_eff_attention_cuda_cu "native/transformers/cuda/mem_eff_attention/*.cu")
 file(GLOB mem_eff_attention_cuda_kernels_cu "native/transformers/cuda/mem_eff_attention/kernels/*.cu")
@@ -179,9 +175,6 @@ if(USE_FLASH_ATTENTION)
   list(APPEND native_transformers_cuda_cpp ${flash_attention_cuda_cpp})
   list(APPEND FLASH_ATTENTION_CUDA_SOURCES ${flash_attention_cuda_cu} ${flash_attention_cuda_kernels_cu})
   list(APPEND ATen_ATTENTION_KERNEL_SRCS ${flash_attention_cuda_kernels_cu})
-
-  list(APPEND native_transformers_hip_hip ${flash_attention_hip_hip})
-  list(APPEND native_transformers_src_hip_hip ${flash_attention_src_hip_hip})
 endif()
 
 if(USE_MEM_EFF_ATTENTION)
@@ -291,34 +284,10 @@ endif()
 
 if(USE_ROCM)
   list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip)
-  list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include)
-  list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include)
-  list(APPEND ATen_HIP_SRCS
-    ${ATen_HIP_SRCS}
-    ${hip_hip}
-    ${native_hip_hip}
-    ${native_nested_hip_hip}
-    ${native_sparse_hip_hip}
-    ${native_quantized_hip_hip}
-    ${native_transformers_hip_hip} ${native_transformers_src_hip_hip}
-  )
+  set(ATen_HIP_SRCS ${ATen_HIP_SRCS} ${hip_hip} ${native_hip_hip} ${native_nested_hip_hip} ${native_sparse_hip_hip} ${native_quantized_hip_hip} ${native_transformers_hip_hip})
   # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources)
-  list(APPEND all_hip_cpp
-    ${native_nested_hip_cpp}
-    ${native_sparse_hip_cpp}
-    ${native_quantized_hip_cpp}
-    ${native_transformers_hip_cpp}
-    ${native_quantized_cudnn_hip_cpp}
-    ${hip_cpp}
-    ${native_hip_cpp}
-    ${native_hip_linalg_cpp}
-    ${cuda_generated_sources}
-    ${ATen_HIP_SRCS}
-    ${native_miopen_cpp}
-    ${native_cudnn_hip_cpp}
-    ${miopen_cpp}
-    ${all_hip_cpp}
-  )
+  set(all_hip_cpp ${native_nested_hip_cpp} ${native_sparse_hip_cpp} ${native_quantized_hip_cpp} ${native_transformers_hip_cpp} ${native_quantized_cudnn_hip_cpp} ${hip_cpp} ${native_hip_cpp} ${native_hip_linalg_cpp} ${cuda_generated_sources} ${ATen_HIP_SRCS})
+  set(all_hip_cpp ${native_miopen_cpp} ${native_cudnn_hip_cpp} ${miopen_cpp} ${all_hip_cpp})
 endif()
 
 list(APPEND ATen_CPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/..)

aten/src/ATen/native/transformers/attention.cpp (+1, -10)

@@ -445,13 +445,6 @@ int64_t _fused_sdp_choice_meta(
     bool is_causal,
     c10::optional<double> scale) {
   auto query_key_set = query_.key_set();
-#if defined(USE_ROCM)
-  bool has_rocm = query_key_set.has(c10::DispatchKey::HIP);
-  if (has_rocm) {
-    auto choice_int = _fused_sdp_choice_stub(at::kHIP, query_, key, value, attn_mask_, dropout_p, is_causal, scale);
-    return choice_int;
-  }
-#else
   bool has_cuda = query_key_set.has(c10::DispatchKey::CUDA);
   if (has_cuda) {
     auto choice_int = _fused_sdp_choice_stub(
@@ -465,7 +458,6 @@ int64_t _fused_sdp_choice_meta(
         scale);
     return choice_int;
   }
-#endif
   return static_cast<int64_t>(sdp::SDPBackend::math);
 }
 namespace {
@@ -633,8 +625,7 @@ Tensor scaled_dot_product_attention(
   validate_sdpa_input(query_, key, value, attn_mask_, dropout_p, is_causal, scale);
   int64_t choice_int = static_cast<int64_t>(sdp::SDPBackend::math);
   if (query_.device().type() == DeviceType::CUDA
-      || query_.device().type() == DeviceType::CPU
-      || query_.device().type() == DeviceType::HIP){
+      || query_.device().type() == DeviceType::CPU){
     choice_int = _fused_sdp_choice_stub(query_.device().type(),
         query_, key, value, attn_mask_, dropout_p, is_causal, scale);
   }

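For readers tracing the behavioral effect of the attention.cpp hunks above: after the revert, only CUDA and CPU tensors consult the fused-kernel chooser, and every other device type (HIP included) stays on the math backend. The following is a minimal standalone sketch of that selection pattern; DeviceType, SDPBackend, and fused_sdp_choice_stub here are simplified stand-ins, not the real ATen types.

#include <cstdint>
#include <iostream>

// Simplified stand-ins for the real ATen enums and the per-device chooser stub.
enum class DeviceType { CPU, CUDA, HIP, Other };
enum class SDPBackend : int64_t { error = -1, math = 0, flash_attention = 1, efficient_attention = 2 };

// Stand-in for _fused_sdp_choice_stub: pretend CUDA picks flash and CPU picks math.
SDPBackend fused_sdp_choice_stub(DeviceType dev) {
  return dev == DeviceType::CUDA ? SDPBackend::flash_attention : SDPBackend::math;
}

int64_t choose_backend(DeviceType dev) {
  // Default to the unfused math path.
  auto choice_int = static_cast<int64_t>(SDPBackend::math);
  // Only devices with a registered chooser are asked; post-revert that is CUDA and CPU.
  if (dev == DeviceType::CUDA || dev == DeviceType::CPU) {
    choice_int = static_cast<int64_t>(fused_sdp_choice_stub(dev));
  }
  return choice_int;
}

int main() {
  std::cout << "CUDA -> " << choose_backend(DeviceType::CUDA)   // 1 (flash_attention)
            << ", HIP -> " << choose_backend(DeviceType::HIP)   // 0 (math)
            << "\n";
}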
aten/src/ATen/native/transformers/cuda/sdp_utils.cpp (-27)

@@ -14,7 +14,6 @@
 #include <c10/util/Exception.h>
 #include <c10/util/env.h>
 #include <c10/util/irange.h>
-#include <c10/util/CallOnce.h>
 
 #include <c10/core/SymInt.h>
 #include <c10/util/string_view.h>
@@ -182,31 +181,6 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug
   using sm80 = SMVersion<8, 0>;
   using sm90 = SMVersion<9, 0>;
   auto dprops = at::cuda::getCurrentDeviceProperties();
-#if USE_ROCM
-  constexpr std::string_view mi200 = "gfx90a:sramecc+:xnack-";
-  static const char *over_arch = [] {
-    auto rc = std::getenv("PYTORCH_DEBUG_FLASH_ATTENTION_GCN_ARCH_OVERRIDE");
-    if (rc) {
-      TORCH_WARN("SDPA functions only loads value from PYTORCH_DEBUG_FLASH_ATTENTION_GCN_ARCH_OVERRIDE once. "
-                 "Later changes to this environment variable with os.environ "
-                 "(or other methods) will not affect SDPA function's behavior.");
-    }
-    return rc;
-  }();
-  const char* real_arch = dprops->gcnArchName;
-  const char* arch = over_arch ? over_arch : real_arch;
-  if (mi200 != arch) {
-    if (debug) {
-      TORCH_WARN(
-          "Flash attention only supports gpu architecture gfx90a, for now. Attempting to run on a ",
-          arch,
-          ".",
-          over_arch ? " This is overrided by PYTORCH_DEBUG_FLASH_ATTENTION_GCN_ARCH_OVERRIDE. Real architecture is " : "",
-          over_arch ? real_arch : "");
-    }
-    return false;
-  }
-#else
   if (!check_sm_version<sm80, sm90>(dprops)) {
     if (debug) {
       TORCH_WARN(
@@ -218,7 +192,6 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug
     }
     return false;
   }
-#endif
   return true;
 }

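With the ROCm branch removed, check_flash_attention_hardware_support reduces to the CUDA compute-capability window [sm80, sm90] checked by check_sm_version<sm80, sm90>. Below is a hedged sketch of that inclusive range check; SMVersion, check_sm_version, and DeviceProps are simplified stand-ins rather than the real cudaDeviceProp plumbing.

#include <iostream>

// Stand-ins mirroring the SMVersion/check_sm_version helpers referenced above.
template <int Major, int Minor>
struct SMVersion {
  static constexpr int major = Major;
  static constexpr int minor = Minor;
};

struct DeviceProps { int major; int minor; };  // simplified stand-in for cudaDeviceProp

// True when the device's compute capability lies in the inclusive [Lower, Upper] window.
template <typename Lower, typename Upper>
bool check_sm_version(const DeviceProps& p) {
  bool at_least_lower = p.major > Lower::major ||
                        (p.major == Lower::major && p.minor >= Lower::minor);
  bool at_most_upper  = p.major < Upper::major ||
                        (p.major == Upper::major && p.minor <= Upper::minor);
  return at_least_lower && at_most_upper;
}

int main() {
  using sm80 = SMVersion<8, 0>;
  using sm90 = SMVersion<9, 0>;
  DeviceProps a100{8, 0}, v100{7, 0};
  std::cout << check_sm_version<sm80, sm90>(a100) << " "    // 1: inside the window
            << check_sm_version<sm80, sm90>(v100) << "\n";  // 0: below sm80, flash unavailable
}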