ROCm · zoranjovanovic-ns · Dec 14, 2024 · Feb 10, 2025 · Feb 10, 2025 · Feb 12, 2025
diff --git a/third_party/llvm/capture.patch b/third_party/llvm/capture.patch
@@ -0,0 +1,11 @@
+--- a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
++++ a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
+@@ -119,7 +119,7 @@
+
+ std::optional<SmallVector<int64_t>>
+ getConstantIntValues(ArrayRef<OpFoldResult> ofrs) {
+-  bool failed = false;
++  bool failed = false;__asm__("":"+r"(failed));
+   SmallVector<int64_t> res = llvm::map_to_vector(ofrs, [&](OpFoldResult ofr) {
+     auto cv = getConstantIntValue(ofr);
+     if (!cv.has_value())
diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
@@ -23,6 +23,7 @@ def repo(name):
             "//third_party/llvm:toolchains.patch",
             "//third_party/llvm:zstd.patch",
             "//third_party/llvm:rocdl_shuffle_down.patch",
+            "//third_party/llvm:capture.patch",
         ],
         link_files = {"//third_party/llvm:run_lit.sh": "mlir/run_lit.sh"},
     )
diff --git a/third_party/tsl/third_party/llvm/capture.patch b/third_party/tsl/third_party/llvm/capture.patch
@@ -0,0 +1,11 @@
+--- a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
++++ a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
+@@ -119,7 +119,7 @@
+
+ std::optional<SmallVector<int64_t>>
+ getConstantIntValues(ArrayRef<OpFoldResult> ofrs) {
+-  bool failed = false;
++  bool failed = false;__asm__("":"+r"(failed));
+   SmallVector<int64_t> res = llvm::map_to_vector(ofrs, [&](OpFoldResult ofr) {
+     auto cv = getConstantIntValue(ofr);
+     if (!cv.has_value())
diff --git a/third_party/tsl/third_party/llvm/workspace.bzl b/third_party/tsl/third_party/llvm/workspace.bzl
@@ -23,6 +23,7 @@ def repo(name):
             "//third_party/llvm:toolchains.patch",
             "//third_party/llvm:zstd.patch",
             "//third_party/llvm:rocdl_shuffle_down.patch",
+            "//third_party/llvm:capture.patch",
         ],
         link_files = {"//third_party/llvm:run_lit.sh": "mlir/run_lit.sh"},
     )
diff --git a/xla/service/gpu/BUILD b/xla/service/gpu/BUILD
@@ -484,12 +484,89 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "gemm_fusion_autotuner_cuda",
+    srcs = [
+        "gemm_fusion_autotuner.h",
+        "gemm_fusion_autotuner_cuda.cc",
+    ],
+    tags = [
+        "cuda-only",
+        "gpu",
+    ],
+    deps = [
+        ":autotuner_compile_util",
+        ":autotuner_util",
+        "//xla:autotuning_proto_cc",
+        "//xla:xla_proto_cc",
+        "//xla/hlo/ir:hlo",
+        "//xla/service:hlo_pass",
+        "//xla/pjrt/distributed:key_value_store_interface",
+        "//xla/service:algorithm_util",
+        "//xla/service:executable",
+        "//xla/service:shaped_buffer",
+        "//xla/service/gpu:ir_emission_utils",
+        "//xla/service/gpu:matmul_utils",
+        "//xla/service/gpu:stream_executor_util",
+        "//xla/service/gpu/transforms:cudnn_fusion_compiler",
+        "//xla/stream_executor:device_description",
+        "//xla/stream_executor:semantic_version",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:string_view",
+        "@com_google_absl//absl/types:span",
+        "@local_config_cuda//cuda:cuda_headers",
+        "@tsl//tsl/platform:env",
+    ],
+)
+
+cc_library(
+    name = "gemm_fusion_autotuner_rocm",
+    srcs = [
+        "gemm_fusion_autotuner.h",
+        "gemm_fusion_autotuner_rocm.cc",
+    ],
+    tags = [
+        "gpu",
+        "rocm-only",
+    ],
+    deps = [
+        ":autotuner_compile_util",
+        ":autotuner_util",
+        "//xla:autotuning_proto_cc",
+        "//xla:xla_proto_cc",
+        "//xla/hlo/ir:hlo",
+        "//xla/service:hlo_pass",
+        "//xla/pjrt/distributed:key_value_store_interface",
+        "//xla/service:executable",
+        "//xla/service:shaped_buffer",
+        "//xla/service/gpu:matmul_utils",
+        "//xla/stream_executor:device_description",
+        #"//xla/stream_executor:semantic_version",
+        "//xla/stream_executor/rocm:rocblas_plugin",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:string_view",
+        "@com_google_absl//absl/types:span",
+        "@local_config_rocm//rocm:rocm_headers",
+        "@tsl//tsl/platform:env",
+    ],
+)
+
 cc_library(
     name = "gemm_fusion_autotuner",
-    srcs = if_cuda_is_configured(["gemm_fusion_autotuner.cc"]),
-    hdrs = if_cuda_is_configured(["gemm_fusion_autotuner.h"]),
+    srcs = ["gemm_fusion_autotuner.cc"],
+    hdrs = ["gemm_fusion_autotuner.h"],
+    tags = ["gpu"],
     local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]),
-    deps = if_cuda_is_configured([
+    deps = if_cuda_is_configured([":gemm_fusion_autotuner_cuda"]) + if_rocm_is_configured([
+        ":gemm_fusion_autotuner_rocm",
+    ]) + [
         ":autotuner_compile_util",
         ":autotuner_util",
         ":backend_configs_cc",
@@ -552,15 +629,12 @@ cc_library(
         "//xla/service/gpu/model:gpu_hlo_cost_analysis",
         "//xla/stream_executor:stream_executor_memory_allocator",
         "@tsl//tsl/platform:path",
-    ]),
+    ],
 )
 
 xla_test(
     name = "gemm_fusion_autotuner_test",
-    srcs = if_cuda_is_configured(["gemm_fusion_autotuner_test.cc"]),
-    backend_tags = {"gpu": [
-        "requires-gpu-sm80",
-    ]},
+    srcs = if_gpu_is_configured(["gemm_fusion_autotuner_test.cc"]),
     backends = [
         "gpu",
     ],
@@ -3803,6 +3877,7 @@ cc_library(
         ":cudnn_fused_conv_rewriter",
         ":cusolver_rewriter",
         ":gemm_algorithm_picker",
+        ":gemm_fusion_autotuner",
         ":gpu_algebraic_simplifier",
         ":gpu_compiler",
         ":gpu_conv_padding_legalization",

diff --git a/xla/service/gpu/amdgpu_compiler.cc b/xla/service/gpu/amdgpu_compiler.cc
@@ -35,6 +35,8 @@ limitations under the License.
 #include "xla/service/gpu/autotuner_util.h"
 #include "xla/service/gpu/conv_algorithm_picker.h"
 #include "xla/service/gpu/cublas_pad_for_gemms.h"
+#include "xla/service/gpu/gemm_algorithm_picker.h"
+#include "xla/service/gpu/gemm_fusion_autotuner.h"
 #include "xla/service/gpu/cublas_padding_requirements.h"
 #include "xla/service/gpu/cudnn_fused_conv_rewriter.h"
 #include "xla/service/gpu/cusolver_rewriter.h"
@@ -277,5 +279,14 @@ AMDGPUCompiler::CompileTargetBinary(const HloModuleConfig& module_config,
   return BackendCompileResult{"", std::move(hsaco)};
 }
 
+absl::Status AMDGPUCompiler::AddGemmFusionAutotuningPasses(
+    HloPassPipeline* pipeline, HloModule* hlo_module,
+    AutotuneConfig& autotune_config, tsl::thread::ThreadPool* thread_pool,
+    const MultiProcessKeyValueStore& key_value_store) {
+  pipeline->AddPass<GemmFusionAutotuner>(autotune_config, GetToolkitVersion(),
+                                         thread_pool, key_value_store);
+  return absl::OkStatus();
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/xla/service/gpu/amdgpu_compiler.h b/xla/service/gpu/amdgpu_compiler.h
@@ -66,6 +66,11 @@ class AMDGPUCompiler : public GpuCompiler {
       se::GpuComputeCapability gpu_version, bool relocatable,
       const HloModule* debug_module, const CompileOptions& options) override;
 
+  absl::Status AddGemmFusionAutotuningPasses(
+      HloPassPipeline* pipeline, HloModule* hlo_module,
+      AutotuneConfig& autotune_config, tsl::thread::ThreadPool* thread_pool,
+      const MultiProcessKeyValueStore& key_value_store) override;
+
  private:
   AMDGPUCompiler(const AMDGPUCompiler&) = delete;
   AMDGPUCompiler& operator=(const AMDGPUCompiler&) = delete;

diff --git a/xla/service/gpu/fusions/triton/triton_support.cc b/xla/service/gpu/fusions/triton/triton_support.cc
@@ -425,7 +425,8 @@ bool IsTritonSupportedDataType(PrimitiveType type,
       return true;
     case F8E5M2:
     case F8E4M3FN:
-      return std::holds_alternative<se::CudaComputeCapability>(gpu_version);
+      return std::holds_alternative<se::CudaComputeCapability>(gpu_version) ||
+             std::holds_alternative<se::RocmComputeCapability>(gpu_version) ;
     case BF16:
       return std::holds_alternative<se::CudaComputeCapability>(gpu_version) ||
              (std::holds_alternative<se::RocmComputeCapability>(gpu_version) &&
@@ -520,6 +521,10 @@ absl::flat_hash_set<HloOpcode> TritonSupportedBinaryElementwiseOps(
     ret.insert(HloOpcode::kRemainder);
     ret.insert(HloOpcode::kPower);
   }
+  if (element_type == PrimitiveType::F16 ||
+      element_type == PrimitiveType::BF16) {
+    ret.insert(HloOpcode::kDivide);
+  }
   return ret;
 }
 

diff --git a/xla/service/gpu/fusions/triton/triton_support_test.cc b/xla/service/gpu/fusions/triton/triton_support_test.cc
@@ -344,12 +344,8 @@ ENTRY triton_computation {
                                      data_type, opcode));
 
   bool skip_failure_branch_to_avoid_crash =
-      (opcode == HloOpcode::kDivide &&
-      (data_type == PrimitiveType::BF16 || data_type == PrimitiveType::F16 ||
-       data_type == PrimitiveType::F8E5M2 ||
-       data_type == PrimitiveType::F8E4M3FN)) ||
       ((opcode == HloOpcode::kMaximum || opcode == HloOpcode::kMinimum) &&
-       data_type == PrimitiveType::F8E5M2 || data_type == PrimitiveType::F8E4M3FN);
+      (data_type == PrimitiveType::F8E5M2 || data_type == PrimitiveType::F8E4M3FN));
 
   RunSupportTest(std::move(ti), /*output_tile_sizes=*/{1, 32}, cc,
                  skip_failure_branch_to_avoid_crash);