Commit 4742080

etaf authored and pytorchmergebot committed
1 parent c418a9a commit 4742080

19 files changed (+432, -49 lines)

.lintrunner.toml (+1)

@@ -263,6 +263,7 @@ exclude_patterns = [
     'torch/csrc/jit/**/*',
     'torch/csrc/jit/serialization/mobile_bytecode_generated.h',
     'torch/csrc/utils/pythoncapi_compat.h',
+    'torch/csrc/inductor/aoti_runtime/sycl_runtime_wrappers.h',
 ]
 init_command = [
     'python3',

build_variables.bzl (+1)

@@ -792,6 +792,7 @@ libtorch_python_xpu_sources = [
     "torch/csrc/xpu/Event.cpp",
     "torch/csrc/xpu/Module.cpp",
     "torch/csrc/xpu/Stream.cpp",
+    "torch/csrc/inductor/aoti_torch/shim_xpu.cpp",
 ]

 libtorch_python_core_sources = [

caffe2/CMakeLists.txt (+4)

@@ -1116,6 +1116,10 @@ if(USE_XPU)

     # Set cached ${ATen_XPU_INCLUDE_DIRS} to torch
     include_directories(SYSTEM ${ATen_XPU_INCLUDE_DIRS})
+    message(INFO "Install ${TORCH_XPU_OPS_DIR}/src/ATen/xpu to ${TORCH_INSTALL_INCLUDE_DIR}/ATen/xpu")
+    install(DIRECTORY "${TORCH_XPU_OPS_DIR}/src/ATen/xpu"
+            DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/ATen/
+            FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp")

   endif()
 endif()
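As a quick sanity check (not part of the commit), an XPU-enabled build that picks up this install rule should end up with the torch-xpu-ops ATen headers under the torch include tree; the exact destination below is inferred from the install() DESTINATION above and is an assumption.

# Hedged sketch: verify that ATen XPU headers landed under <torch>/include/ATen/xpu,
# the destination implied by the install() rule in this hunk.
import os
import torch

xpu_include_dir = os.path.join(os.path.dirname(torch.__file__), "include", "ATen", "xpu")
print(xpu_include_dir, "exists:", os.path.isdir(xpu_include_dir))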

setup.py (+1)

@@ -1289,6 +1289,7 @@ def main():
         "include/torch/csrc/inductor/aoti_torch/*.h",
         "include/torch/csrc/inductor/aoti_torch/c/*.h",
         "include/torch/csrc/inductor/aoti_torch/generated/*.h",
+        "include/torch/csrc/inductor/aoti_torch/generated/extend/*.h",
         "include/torch/csrc/jit/*.h",
         "include/torch/csrc/jit/backends/*.h",
         "include/torch/csrc/jit/generated/*.h",

test/inductor/test_cuda_cpp_wrapper.py (+57 -30)

@@ -7,11 +7,12 @@
 import torch
 from torch._inductor import config
 from torch._inductor.test_case import TestCase as InductorTestCase
+from torch._inductor.utils import is_gpu
 from torch.testing._internal.common_device_type import (
     get_desired_device_type_test_bases,
 )
 from torch.testing._internal.common_utils import slowTest, TEST_WITH_ASAN
-from torch.testing._internal.inductor_utils import HAS_CUDA
+from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU


 try:
@@ -38,29 +39,40 @@
     raise


-_desired_test_bases = get_desired_device_type_test_bases()
-RUN_CUDA = (
-    HAS_CUDA
-    and any(getattr(x, "device_type", "") == "cuda" for x in _desired_test_bases)
+_desired_test_bases = get_desired_device_type_test_bases(allow_xpu=True)
+RUN_GPU = (
+    HAS_GPU
+    and any(is_gpu(getattr(x, "device_type", "")) for x in _desired_test_bases)
     and not TEST_WITH_ASAN
 )


-class CudaWrapperTemplate:
+class GpuWrapperTemplate:
     pass


-class TestCudaWrapper(InductorTestCase):
-    device = "cuda"
+class TestGpuWrapper(InductorTestCase):
+    device = GPU_TYPE


-class DynamicShapesCudaWrapperCudaTests(InductorTestCase):
-    device = "cuda"
+class DynamicShapesGpuWrapperGpuTests(InductorTestCase):
+    device = GPU_TYPE


-test_failures_cuda_wrapper = {
+test_failures_gpu_wrapper = {
     "test_mm_plus_mm2_cuda_dynamic_shapes": test_torchinductor.TestFailure(
-        ("cuda_wrapper",), is_skip=True
+        ("gpu_wrapper",), is_skip=True
+    ),
+    "test_randint_xpu": test_torchinductor.TestFailure(("gpu_wrapper",), is_skip=False),
+    "test_randint_xpu_dynamic_shapes": test_torchinductor.TestFailure(
+        ("gpu_wrapper",), is_skip=False
+    ),
+    # ATen ops: scaled_dot_product_efficient_attention not implemented on XPU.
+    "test_scaled_dot_product_efficient_attention_xpu": test_torchinductor.TestFailure(
+        ("gpu_wrapper",), is_skip=False
+    ),
+    "test_scaled_dot_product_efficient_attention_xpu_dynamic_shapes": test_torchinductor.TestFailure(
+        ("gpu_wrapper",), is_skip=False
     ),
 }

@@ -114,20 +126,34 @@ def fn(self):
     fn.__dict__ = copy.deepcopy(func.__dict__)
     if condition:
         setattr(
-            CudaWrapperTemplate,
+            GpuWrapperTemplate,
             test_name,
             fn,
         )


-if RUN_CUDA:
+if RUN_GPU:

     class BaseTest(NamedTuple):
         name: str
-        device: str = "cuda"
+        device: str = GPU_TYPE
         tests: InductorTestCase = test_torchinductor.GPUTests()
         check_code: bool = True

+    # XPU Not implemented yet
+    XPU_BASE_TEST_SKIP = [
+        "test_foreach_cpp_wrapper",
+        "test_enable_dynamic_shapes_cpp_wrapper",
+        "test_dynamic_shapes_persistent_reduction_mixed_x_dim",
+        "test_cat_slice_cat",
+        "test_mm_plus_mm2",
+        "test_mm_plus_mm3",
+        "test_addmm",
+        "test_linear_relu",
+        "test_fft_real_input",
+        "test_fft_real_input_real_output",
+    ]
+
     # Maintain two separate test lists for cuda and cpp for now
     for item in [
         BaseTest("test_add_complex"),
@@ -236,40 +262,41 @@ class BaseTest(NamedTuple):
             tests=test_select_algorithm.TestSelectAlgorithm(),
         ),
     ]:
+        if item.device == "xpu" and item.name in XPU_BASE_TEST_SKIP:
+            continue
         make_test_case(item.name, item.device, item.tests, check_code=item.check_code)

     from torch._inductor.utils import is_big_gpu

-    if is_big_gpu(0):
+    if GPU_TYPE == "cuda" and is_big_gpu(0):
         skip_list = ["test_addmm", "test_linear_relu"]
         # need to skip instead of omit, otherwise fbcode ci can be flaky
         for test_name in skip_list:
-            test_failures_cuda_wrapper[
+            test_failures_gpu_wrapper[
                 f"{test_name}_cuda"
-            ] = test_torchinductor.TestFailure(("cuda_wrapper",), is_skip=True)
-            test_failures_cuda_wrapper[
-                f"{test_name}_cuda_dynamic_shapes"
-            ] = test_torchinductor.TestFailure(("cuda_wrapper",), is_skip=True)
+            ] = test_torchinductor.TestFailure(("gpu_wrapper",), is_skip=True)
+            test_failures_gpu_wrapper[
+                f"{test_name}_gpu_dynamic_shapes"
+            ] = test_torchinductor.TestFailure(("gpu_wrapper",), is_skip=True)

     test_torchinductor.copy_tests(
-        CudaWrapperTemplate, TestCudaWrapper, "cuda_wrapper", test_failures_cuda_wrapper
+        GpuWrapperTemplate, TestGpuWrapper, "gpu_wrapper", test_failures_gpu_wrapper
     )

-    DynamicShapesCudaWrapperTemplate = (
-        test_torchinductor_dynamic_shapes.make_dynamic_cls(CudaWrapperTemplate)
+    DynamicShapesGpuWrapperTemplate = (
+        test_torchinductor_dynamic_shapes.make_dynamic_cls(GpuWrapperTemplate)
     )

     test_torchinductor.copy_tests(
-        DynamicShapesCudaWrapperTemplate,
-        DynamicShapesCudaWrapperCudaTests,
-        "cuda_wrapper",
-        test_failures_cuda_wrapper,
+        DynamicShapesGpuWrapperTemplate,
+        DynamicShapesGpuWrapperGpuTests,
+        "gpu_wrapper",
+        test_failures_gpu_wrapper,
         xfail_prop="_expected_failure_dynamic_wrapper",
     )

 if __name__ == "__main__":
     from torch._inductor.test_case import run_tests

-    print(f"FS: run_cuda {RUN_CUDA}")
-    if RUN_CUDA:
+    if RUN_GPU:
         run_tests(needs="filelock")
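For orientation, here is a hedged sketch (not part of the commit) of the gating pattern the patched test now relies on: GPU_TYPE resolves to whichever GPU backend is available and is_gpu accepts both "cuda" and "xpu", so one test template can drive either backend.

# Hedged sketch: gate GPU-only tests on whichever backend (CUDA or XPU) is present.
# Assumes torch.testing._internal.inductor_utils exposes GPU_TYPE/HAS_GPU and
# torch._inductor.utils exposes is_gpu, as imported in the patched test file.
from torch._inductor.utils import is_gpu
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU

if HAS_GPU and is_gpu(GPU_TYPE):
    # GPU_TYPE is "cuda" on NVIDIA builds and "xpu" on Intel GPU builds, so a
    # single test template can be parametrized with device = GPU_TYPE.
    print(f"running GPU wrapper tests on device: {GPU_TYPE}")
else:
    print("no supported GPU backend available; skipping")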

test/inductor/test_memory_planning.py (-2)

@@ -3,7 +3,6 @@
 import sys
 import unittest

-from torch.testing._internal.common_device_type import expectedFailureXPU
 from torch.testing._internal.common_utils import (
     IS_CI,
     IS_WINDOWS,
@@ -71,7 +70,6 @@ def test_python_wrapper(self):
         )
         self.assertTrue(same(f(*args), result))

-    @expectedFailureXPU
     def test_cpp_wrapper(self):
         f, args = self._generate(device=GPU_TYPE)
         compiled = torch.compile(f, dynamic=True)

test/inductor/test_triton_kernels.py (-1)

@@ -3265,7 +3265,6 @@ def f(x, y):
         gm = make_fx(f, tracing_mode=tracing_mode)(x, x)
         self.assertEqual(gm(x, x), x + x)

-    @skipIfXpu
     @requires_gpu
     @patch.object(torch._inductor.config, "cpp_wrapper", True)
     @patch.object(torch._inductor.config, "triton.autotune_at_compile_time", True)

torch/_inductor/codegen/common.py (+1)

@@ -382,6 +382,7 @@ def init_backend_registration():
            "xpu",
            TritonScheduling,
            PythonWrapperCodegen,
+           CppWrapperGpu,
        )

    private_backend = torch._C._get_privateuse1_backend_name()
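For context, the registration above gives the "xpu" backend a C++ wrapper codegen alongside its Python wrapper codegen. A hedged sketch of querying that registry; it assumes init_backend_registration and get_wrapper_codegen_for_device are exported from torch._inductor.codegen.common, as in the file being patched.

# Hedged sketch: after this change, asking for the C++ wrapper codegen of the
# "xpu" backend is expected to return CppWrapperGpu instead of nothing.
from torch._inductor.codegen.common import (
    get_wrapper_codegen_for_device,
    init_backend_registration,
)

init_backend_registration()  # populates the device -> codegen registry
python_wrapper = get_wrapper_codegen_for_device("xpu", cpp_wrapper=False)
cpp_wrapper = get_wrapper_codegen_for_device("xpu", cpp_wrapper=True)
print(python_wrapper, cpp_wrapper)  # expected: PythonWrapperCodegen, CppWrapperGpu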

torch/_inductor/codegen/cpp_utils.py (+1)

@@ -81,6 +81,7 @@
 DEVICE_TO_ATEN = {
     "cpu": "at::kCPU",
     "cuda": "at::kCUDA",
+    "xpu": "at::kXPU",
 }

 LAYOUT_TO_ATEN = {
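The new table entry lets C++ codegen translate the "xpu" device string into its ATen device constant. A minimal sketch of that lookup follows; emit_device is an illustrative helper, not an Inductor API.

# Hedged sketch of how a device-to-ATen table like DEVICE_TO_ATEN is consulted
# when splicing device constants into generated C++.
DEVICE_TO_ATEN = {
    "cpu": "at::kCPU",
    "cuda": "at::kCUDA",
    "xpu": "at::kXPU",
}

def emit_device(device_str: str) -> str:
    # Map an Inductor device string onto the ATen token used in C++ output.
    return DEVICE_TO_ATEN[device_str]

print(emit_device("xpu"))  # at::kXPU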

torch/_inductor/codegen/cpp_wrapper_cpu.py (+8 -3)

@@ -198,11 +198,16 @@ class RAIIPyObject {
            }}
            """
        )
-        extend_aoti_path = (
+        extend_aoti_c_shim_include = (
             f"torch/csrc/inductor/aoti_torch/generated/extend/c_shim_{self.device}.h"
         )
-        if os.path.exists(extend_aoti_path):
-            self.header.splice(f"#include <{extend_aoti_path}>")
+        extend_aoti_c_shim_path = os.path.join(
+            os.path.dirname(torch.__file__),
+            "include",
+            extend_aoti_c_shim_include,
+        )
+        if os.path.exists(extend_aoti_c_shim_path):
+            self.header.splice(f"#include <{extend_aoti_c_shim_include}>")

         enable_kernel_profile = config.cpp.enable_kernel_profile and sys.platform in [
             "linux",

torch/_inductor/codegen/wrapper.py (+1)

@@ -782,6 +782,7 @@ def write_kernel_autotune_defs_header(self) -> None:
            async_compile = AsyncCompile()
            generate_example_value = AlgorithmSelectorCache.generate_example_value
            empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
+           empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
            """
        )
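The compile-time autotune header now also binds an XPU allocation helper next to the existing CUDA one. Below is a hedged probe for that binding; the attribute may be absent on builds without XPU support, which is why the sketch checks rather than calls it.

# Hedged sketch: check whether the XPU allocation helper referenced by the
# generated autotune header is available in this build of torch.
import torch

has_xpu_alloc = hasattr(torch._C._dynamo.guards, "_empty_strided_xpu")
print(f"_empty_strided_xpu binding available: {has_xpu_alloc}")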

torch/_inductor/codegen/xpu/device_op_overrides.py (+46 -1)

@@ -16,7 +16,7 @@ def device_guard(self, device_idx):
         return f"torch.xpu._DeviceGuard({device_idx})"

     def cpp_device_guard(self):
-        return "at::xpu::XPUGuard"
+        return "at::DeviceGuard"

     def cpp_aoti_device_guard(self):
         return "AOTIXpuGuard"
@@ -30,5 +30,50 @@ def cpp_aoti_stream_guard(self):
     def cpp_getStreamFromExternal(self):
         return "at::xpu::getStreamFromExternal"

+    def kernel_header(self):
+        source_codes = """
+        #include <torch/csrc/inductor/aoti_runtime/sycl_runtime_wrappers.h>
+        """
+        return source_codes
+
+    def kernel_driver(self):
+        source_codes = """
+            namespace {
+
+            struct Grid {
+                Grid(uint32_t x, uint32_t y, uint32_t z)
+                  : grid_x(x), grid_y(y), grid_z(z) {}
+                uint32_t grid_x;
+                uint32_t grid_y;
+                uint32_t grid_z;
+
+                bool is_non_zero() {
+                    return grid_x > 0 && grid_y > 0 && grid_z > 0;
+                }
+            };
+
+            } // anonymous namespace
+
+        """
+        return source_codes
+
+    def abi_compatible_header(self):
+        return """
+        #include <torch/csrc/inductor/aoti_runtime/utils_xpu.h>
+        #include <torch/csrc/inductor/aoti_runtime/sycl_runtime_wrappers.h>
+        """
+
+    def cpp_stream_type(self):
+        return "sycl::queue*"
+
+    def aoti_get_stream(self):
+        return "aoti_torch_get_current_xpu_stream"
+
+    def cpp_kernel_type(self):
+        return "std::unique_ptr<sycl::kernel>"
+
+    def cpp_device_ptr(self):
+        return "void *"
+

 register_device_op_overrides("xpu", XPUDeviceOpOverrides())
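Taken together, these overrides tell the C++/AOTI wrapper which SYCL-flavored types and helpers to splice into generated code (a sycl::queue* stream, a std::unique_ptr<sycl::kernel> kernel handle, and so on). A hedged sketch of reading them back through the registry; it assumes get_device_op_overrides is exported from torch._inductor.codegen.common alongside register_device_op_overrides.

# Hedged sketch: query the strings the XPU backend splices into generated C++.
from torch._inductor.codegen.common import get_device_op_overrides

xpu_ops = get_device_op_overrides("xpu")
print(xpu_ops.cpp_device_guard())  # expected: "at::DeviceGuard"
print(xpu_ops.cpp_stream_type())   # expected: "sycl::queue*"
print(xpu_ops.cpp_kernel_type())   # expected: "std::unique_ptr<sycl::kernel>"
print(xpu_ops.aoti_get_stream())   # expected: "aoti_torch_get_current_xpu_stream"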
