
Commit 86c7652

shunting314 authored and pytorchmergebot committed
[inductor] layout optimization for conv (pytorch#99773)
A convolution kernel with channels-last inputs runs much faster than one with contiguous inputs. This PR leverages that by optimizing tensor layouts so that we provide channels-last inputs to convolutions. Some care needs to be taken to avoid converting tensor layouts back and forth between contiguous and channels last, since those extra copies hurt performance quite a bit. Latest perf numbers [here](https://hud.pytorch.org/benchmark/compilers?startTime=Wed%2C%2024%20May%202023%2023%3A40%3A37%20GMT&stopTime=Wed%2C%2031%20May%202023%2023%3A40%3A37%20GMT&granularity=hour&suite=torchbench&mode=training&dtype=amp&lBranch=shunting-layout-opt-19&lCommit=baa797fc100688dfb044fbcbdebcfd2591710f78&rBranch=main&rCommit=999bae0f54108ffc5b7cf2524a02a83901554b16):

- TB: 1.64x -> 1.69x
- HF: 1.79x -> 1.78x (random noise)
- TIMM: 1.51x -> 1.65x

Right now we disable layout optimization for dynamic shapes since there is a perf loss in that combination. Here is a GH issue to follow up: pytorch#102670

Pull Request resolved: pytorch#99773
Approved by: https://github.com/jansel
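For background (this sketch is not part of the commit): the win described above comes from convolution kernels typically running faster on channels-last (NHWC) inputs on recent GPUs. A minimal, self-contained timing sketch with arbitrary layer sizes; actual speedups depend on the GPU, dtype, and cuDNN version.

import torch
from torch import nn

conv = nn.Conv2d(64, 128, kernel_size=3, padding=1, bias=False).cuda().half()
x = torch.randn(32, 64, 56, 56, device="cuda", dtype=torch.half)

def bench(fn, iters=50):
    # simple CUDA-event timing with a short warmup
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    for _ in range(10):
        fn()
    start.record()
    for _ in range(iters):
        fn()
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters

t_contig = bench(lambda: conv(x))

# move both the weight and the activation to channels last
conv_cl = conv.to(memory_format=torch.channels_last)
x_cl = x.to(memory_format=torch.channels_last)
t_cl = bench(lambda: conv_cl(x_cl))

print(f"contiguous: {t_contig:.3f} ms/iter, channels_last: {t_cl:.3f} ms/iter")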
1 parent 4da8844 commit 86c7652

File tree

15 files changed: +637, -21 lines


benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv (+1, -1)

@@ -50,4 +50,4 @@ timm_vovnet,pass,8
 tts_angular,pass,10
 vgg16,pass,8
 vision_maskrcnn,fail_accuracy,167
-yolov3,pass,10
+yolov3,pass,11

test/inductor/test_layout_optim.py (+173, new file)

@@ -0,0 +1,173 @@
# Owner(s): ["module: inductor"]
import copy
import os

import torch
from torch import nn
from torch._dynamo.test_case import run_tests, TestCase
from torch._dynamo.utils import same
from torch.testing._internal.common_utils import TEST_WITH_ROCM
from torch.testing._internal.inductor_utils import HAS_CUDA

USE_DDP_WRAPPER = os.environ.get("USE_DDP_WRAPPER", "1") == "1"


class Model2Conv(nn.Module):
    def __init__(self, dim=512, manual_graph_break=False):
        super().__init__()
        self.conv1 = nn.Conv2d(3, dim, kernel_size=3, stride=2, bias=False)
        self.conv2 = nn.Conv2d(dim, dim, kernel_size=3, stride=2, bias=False)
        self.manual_graph_break = manual_graph_break

    def forward(self, x):
        x = self.conv1(x)
        if self.manual_graph_break:
            torch._dynamo.graph_break()
        x = self.conv2(x)
        return x

    def get_example_inputs(self):
        return (torch.rand(2, 3, 16, 16),)


class TestLayoutOptim(TestCase):
    @classmethod
    def setUpClass(cls):
        super().setUpClass()

        import torch.distributed as dist

        port = 10001
        dist.init_process_group(
            backend="nccl", init_method=f"tcp://localhost:{port}", world_size=1, rank=0
        )

    def verify_accuracy(
        self, model_class, use_ddp_wrapper=USE_DDP_WRAPPER, is_train=False
    ):
        # there are 2 potential ways to introduce graph breaks
        # 1. manually
        # 2. using DDP
        # if we are not using DDP to introduce graph breaks, do that manually
        def wrap_mod(m):
            if is_train:

                def f(*inp):
                    x = m(*inp)
                    x.sum().backward()

                    grads = []
                    for name, param in m.named_parameters():
                        grad = param.grad
                        if param.grad is None:
                            grad = torch.zeros_like(param)
                        grads.append(grad)
                    return grads

                return f
            else:
                return m

        manual_graph_break = not use_ddp_wrapper
        mod = model_class(manual_graph_break=manual_graph_break).cuda()
        inp = [t.cuda() for t in mod.get_example_inputs()]
        expected_out = wrap_mod(mod)(*inp)

        fp64_mod = copy.deepcopy(mod).to(torch.float64)
        fp64_inp = [t.to(torch.float64) for t in copy.deepcopy(inp)]
        fp64_out = wrap_mod(fp64_mod)(*fp64_inp)

        if use_ddp_wrapper:
            from torch.nn.parallel import DistributedDataParallel as DDP

            ddp_wrapped_mod = DDP(mod)
            opt_mod = torch.compile(wrap_mod(ddp_wrapped_mod))
        else:
            opt_mod = torch.compile(wrap_mod(mod))
        actual_out = opt_mod(*inp)

        if is_train:
            self.assertTrue(same(expected_out, actual_out, fp64_ref=fp64_out))
        else:
            expected_sum = expected_out.sum()
            actual_sum = actual_out.sum()
            print(f"Expected sum {expected_sum}, actual sum {actual_sum}")
            self.assertTrue(same(expected_out, actual_out, fp64_ref=fp64_out))

    def verify_accuracy_for_infer(self, *args, **kwargs):
        self.verify_accuracy(*args, **kwargs, is_train=False)

    def verify_accuracy_for_train(self, *args, **kwargs):
        self.verify_accuracy(*args, **kwargs, is_train=True)

    def test_2conv_with_graph_break(self):
        """
        Make sure graph break does not cause any accuracy issue.
        """
        self.verify_accuracy_for_infer(Model2Conv)

    def test_3conv_with_graph_break(self):
        class Model(nn.Module):
            def __init__(
                self, dim=512, patch_size=7, kernel_size=7, manual_graph_break=False
            ):
                super().__init__()
                self.seq = nn.Sequential(
                    nn.Conv2d(
                        3, dim, kernel_size=patch_size, stride=patch_size, bias=False
                    ),
                    nn.Conv2d(
                        dim, dim, kernel_size, groups=dim, padding="same", bias=False
                    ),
                )
                self.conv = nn.Conv2d(dim, dim, kernel_size=1, bias=False)
                self.manual_graph_break = manual_graph_break

            def forward(self, x):
                x = self.seq(x)
                if self.manual_graph_break:
                    torch._dynamo.graph_break()
                x = self.conv(x)
                return x

            def get_example_inputs(self):
                return (torch.randn(2, 3, 16, 16),)

        self.verify_accuracy_for_infer(Model)

    def test_keep_output_layout_infer(self):
        class Model(nn.Module):
            def __init__(self):
                super().__init__()
                self.conv = nn.Conv2d(
                    3, 128, kernel_size=3, padding=1, stride=1, bias=False
                )

            def forward(self, x):
                x = self.conv(x)
                return x

            def get_example_inputs(self):
                return (torch.randn(2, 3, 5, 5),)

        mod = Model().cuda()
        inp = [t.cuda() for t in mod.get_example_inputs()]
        out = mod(*inp)

        opt_mod = torch.compile(mod)
        opt_out = opt_mod(*inp)

        # We should be able to do view on eager output
        out.view(5, -1)

        # We should be able to do view on the output of the optimized module
        # Note that if the output is channels last, the view op will fail.
        opt_out.view(5, -1)

    def test_training_acc(self):
        self.verify_accuracy_for_train(Model2Conv)


if __name__ == "__main__":
    if HAS_CUDA and not TEST_WITH_ROCM:
        run_tests()

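The test_keep_output_layout_infer test above relies on the behavior noted in its comment: view can fail on a channels-last tensor, which is why the compiled module must keep the eager output layout. A standalone illustration (not part of the diff, arbitrary shape):

import torch

x = torch.randn(2, 3, 5, 5)
x_cl = x.to(memory_format=torch.channels_last)

x.view(5, -1)  # works: the default contiguous layout is view-compatible
try:
    x_cl.view(5, -1)  # channels-last strides cannot represent this flattening
except RuntimeError as err:
    print("view failed:", err)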
test/test_fake_tensor.py (+7)

@@ -632,6 +632,13 @@ def test_aten_slice_scatter_multi_device(self):
         self.checkType(r3, "cpu", (4, 4))
         self.checkType(out, "cpu", (4, 4))

+    def test__adaptive_avg_pool2d_backward(self):
+        with FakeTensorMode():
+            grad_out = torch.rand(2, 3, 4, 4)
+            inp = torch.rand(2, 3, 4, 4).to(memory_format=torch.channels_last)
+            grad_in = torch.ops.aten._adaptive_avg_pool2d_backward(grad_out, inp)
+            self.assertTrue(torch._prims_common.suggest_memory_format(grad_in) == torch.channels_last)
+
 class FakeTensorConstHandling(TestCase):
     def assertConst(self, *args):

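The new fake-tensor test checks the propagated layout via torch._prims_common.suggest_memory_format, which infers a memory format from a tensor's strides. A quick illustration on real tensors (not part of the diff):

import torch
from torch._prims_common import suggest_memory_format

a = torch.randn(2, 3, 4, 4)
b = a.to(memory_format=torch.channels_last)
print(suggest_memory_format(a))  # torch.contiguous_format
print(suggest_memory_format(b))  # torch.channels_last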
torch/_functorch/aot_autograd.py (+24, -1)

@@ -2765,6 +2765,14 @@ def aot_dispatch_autograd(flat_fn, flat_args: List[Any], aot_config: AOTConfig,
         # We are not clearing flat_args here because
         # 1) There is a check in the debug compiler at the end
         # 2) It does not matter as these are fake tensors
+
+        # The compiler needs this field to find the original model outputs
+        # among the AOTAutograd fwd module's outputs, so that it can make sure
+        # optimizations like layout optimization do not change those tensors'
+        # layout.
+        # TODO: once https://github.com/pytorch/pytorch/pull/100652/files#r1212002707 is in,
+        # change this to access fw_metadata from the global tracing context.
+        fw_module.meta["original_output_start_index"] = fw_metadata.num_mutated_inputs
         compiled_fw_func = aot_config.fw_compiler(
             fw_module, adjusted_flat_args
         )

@@ -2981,9 +2989,24 @@ def call_compiled_backward():
         if CompiledFunction.compiled_bw is None:
             assert all(a is not None for a in all_args)
             context = torch._C._DisableAutocast if disable_amp else nullcontext
+
+            placeholder_list = fx_placeholder_vals(bw_module)
+
+            # Saved activations can have different strides than in eager mode if
+            # the compiler does layout optimization. We should restride the
+            # tensors passed in for compiling the backward graph using the
+            # saved tensors' strides.
+            for i in range(len(placeholder_list)):
+                ph_arg = placeholder_list[i]
+                real_arg = all_args[i]
+                if not isinstance(ph_arg, torch.Tensor):
+                    continue
+                if ph_arg.stride() != real_arg.stride():
+                    placeholder_list[i] = ph_arg.as_strided(ph_arg.size(), real_arg.stride())
+
             with tracing(saved_context), context(), track_graph_compiling(aot_config, "backward"):
                 CompiledFunction.compiled_bw = aot_config.bw_compiler(
-                    bw_module, fx_placeholder_vals(bw_module)
+                    bw_module, placeholder_list
                 )

         ctx.maybe_clear_saved_tensors()

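A small sketch of the restriding idea in the hunk above (illustrative only, with made-up shapes): if the activation actually saved at runtime is channels last while the traced placeholder has default contiguous strides, as_strided gives the placeholder the real strides before the backward graph is compiled.

import torch

# real saved activation: channels last because of the layout optimization
saved = torch.randn(2, 8, 4, 4).to(memory_format=torch.channels_last)
# placeholder traced earlier with default contiguous strides
ph = torch.empty(2, 8, 4, 4)

if isinstance(ph, torch.Tensor) and ph.stride() != saved.stride():
    ph = ph.as_strided(ph.size(), saved.stride())

assert ph.stride() == saved.stride()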
torch/_inductor/codegen/triton.py (+77, -4)

@@ -25,11 +25,13 @@
     get_fused_kernel_name,
     get_kernel_category_by_source_code,
     get_kernel_metadata,
+    green_text,
     next_power_of_2,
     sympy_product,
     sympy_subs,
     sympy_symbol,
     unique,
+    yellow_text,
 )
 from ..virtualized import ops, V

@@ -1425,19 +1427,22 @@ def codegen_kernel_benchmark(self):
         with result.indent():
             name_cnt = itertools.count()
             var_names = []
-            for arg_name in call_args:
+            for arg_name, arg_sig in zip(call_args, signature):
                 var_name = f"arg_{next(name_cnt)}"
                 buf = V.graph.get_buffer(arg_name)
                 if buf:
                     result.writeline(
-                        f"{var_name} = rand_strided({tuple(buf.get_size())}, {tuple(buf.get_stride())}, device='{buf.get_device()}', dtype={buf.get_dtype()})"  # noqa: B950 line too long
+                        f"{var_name} = rand_strided({V.graph.sizevars.size_hints(buf.get_size())}, {V.graph.sizevars.size_hints(buf.get_stride())}, device='{buf.get_device()}', dtype={buf.get_dtype()})"  # noqa: B950 line too long
                     )
                 elif arg_name in V.graph.constants:
                     # note that random seed is put in V.graph.constants
                     const_tensor = V.graph.constants[arg_name]
                     result.writeline(
-                        f"{var_name} = rand_strided({tuple(const_tensor.size())}, {tuple(const_tensor.stride())}, device='{const_tensor.device}', dtype={const_tensor.dtype})"  # noqa: B950 line too long
+                        f"{var_name} = rand_strided({V.graph.sizevars.size_hints(const_tensor.size())}, {V.graph.sizevars.size_hints(const_tensor.stride())}, device='{const_tensor.device}', dtype={const_tensor.dtype})"  # noqa: B950 line too long
                     )
+                elif isinstance(arg_sig, SizeArg):
+                    symval_hint = V.graph.sizevars.size_hint(arg_sig.expr)
+                    result.writeline(f"{var_name} = {symval_hint}")
                 else:
                     raise KeyError(
                         f"Don't find the buffer or const tensor for {arg_name}"

@@ -1457,7 +1462,7 @@ def codegen_kernel_benchmark(self):
                     f"torch.cuda.set_device({index})"
                 )  # no-op to ensure context
             for tree in self.range_trees:
-                expr = pexpr(tree.numel)
+                expr = pexpr(V.graph.sizevars.size_hint(tree.numel))
                 if tree.prefix != "r" or self.inside_reduction:
                     extra_args.append(expr)
                 if tree.prefix != "r":

@@ -1730,6 +1735,71 @@ def call_kernel(self, name: str):
             V.graph.scheduler.current_device.index,
         )

+    def warn_mix_layout(self, kernel_name):
+        """
+        Print a message if the kernel has mixed-layout inputs.
+        Only care about 4D tensors for now.
+        """
+        if (
+            len(self.args.input_buffers) == 1
+            and len(self.args.output_buffers) == 1
+            and len(self.args.inplace_buffers) == 0
+        ):
+            # Even if the input buffer and output buffer have different layouts,
+            # this can be a layout conversion kernel. No need to warn about
+            # mixed layouts here.
+            return
+
+        argdefs, call_args, signature = self.args.python_argdefs()
+        uniform_stride_order = None
+        for arg_name in call_args:
+            buf = V.graph.get_buffer(arg_name)
+            if buf and len(buf.layout.size) == 4:
+                # ignore the tensor if only one dimension is larger than 1
+                if len([x for x in buf.layout.size if x == 1]) == 3:
+                    continue
+                stride_order = ir.get_stride_order(buf.layout.stride)
+                if uniform_stride_order is None:
+                    uniform_stride_order = stride_order
+                elif uniform_stride_order != stride_order:
+                    msg = yellow_text(
+                        f"Expected stride order {uniform_stride_order}, but found stride order"
+                        + f" {stride_order} for kernel {kernel_name}"
+                    )
+                    log.warning(msg)
+
+                    stride_order_list = [
+                        ir.get_stride_order(V.graph.get_buffer(name).layout.stride)
+                        if V.graph.get_buffer(name)
+                        else None
+                        for name in call_args
+                    ]
+                    size_list = [
+                        V.graph.get_buffer(name).layout.size
+                        if V.graph.get_buffer(name)
+                        else None
+                        for name in call_args
+                    ]
+                    source_list = [
+                        "GraphInput"
+                        if name in V.graph.graph_inputs
+                        else "IntermediateBuffer"
+                        if name in V.graph.name_to_buffer
+                        else None
+                        for name in call_args
+                    ]
+
+                    msg = yellow_text(
+                        f" param names {argdefs}\n buf names {call_args}\n strides {stride_order_list}"
+                        + f"\n sizes {size_list}\n sources {source_list}\n"
+                    )
+                    log.warning(msg)
+                    return
+        msg = green_text(
+            f"All the inputs for the triton kernel {kernel_name} have uniform layout"
+        )
+        log.warning(msg)
+
     def create_cse_var(self, *args, **kwargs):
         return TritonCSEVariable(*args, **kwargs)

@@ -2014,6 +2084,9 @@ def codegen_node_schedule(self, node_schedule, numel, reduction_numel):

             kernel.call_kernel(kernel_name)

+            if config.warn_mix_layout:
+                kernel.warn_mix_layout(kernel_name)
+
             if (
                 V.graph.wrapper_code.supports_intermediate_hooks
                 and config.generate_intermediate_hooks

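For intuition about the stride orders that warn_mix_layout compares, here is a conceptual sketch (not the inductor implementation of ir.get_stride_order): rank each dimension by its stride, so contiguous NCHW and channels-last tensors of the same shape get different orders.

import torch

def stride_order(strides):
    # rank each dimension by its stride; the dimension with the smallest
    # stride gets rank 0 (it is the fastest-moving one in memory)
    sorted_dims = sorted(range(len(strides)), key=lambda i: strides[i])
    order = [0] * len(strides)
    for rank, dim in enumerate(sorted_dims):
        order[dim] = rank
    return order

a = torch.empty(2, 3, 8, 8)                                      # contiguous NCHW
b = torch.empty(2, 3, 8, 8).to(memory_format=torch.channels_last)

print(stride_order(a.stride()))  # [3, 2, 1, 0]
print(stride_order(b.stride()))  # [3, 0, 2, 1]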
torch/_inductor/codegen/wrapper.py (+11)

@@ -342,6 +342,14 @@ def get_output_refs(self):
     def mark_output_type(self):
         return

+    def codegen_input_size_asserts(self):
+        for name, buf in V.graph.graph_inputs.items():
+            if isinstance(buf, sympy.Expr):
+                continue
+            size = self.codegen_shape_tuple(buf.get_size())
+            stride = self.codegen_shape_tuple(buf.get_stride())
+            self.prefix.writeline(f"assert_size_stride({name}, {size}, {stride})")
+
     def write_prefix(self):
         self.prefix.splice(
             """

@@ -360,7 +368,10 @@ def call(args):
                 lhs = f"{', '.join(V.graph.graph_inputs.keys())}{'' if inp_len != 1 else ','}"
                 self.prefix.writeline(f"{lhs} = args")
                 self.prefix.writeline("args.clear()")
+
             self.codegen_inputs(self.prefix, V.graph.graph_inputs)
+            if config.size_asserts:
+                self.codegen_input_size_asserts()

     def write_get_cuda_stream(self, index):
         self.write_triton_header_once()

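The assert_size_stride calls emitted above guard the generated wrapper code against inputs arriving with an unexpected layout. A hypothetical pure-Python stand-in for that check, only to illustrate what the generated assertion verifies (the actual helper ships with PyTorch and is available to the generated code):

import torch

def assert_size_stride_py(t, size, stride):
    # hypothetical stand-in for the assert_size_stride helper used by generated code
    assert tuple(t.size()) == tuple(size), f"expected size {size}, got {tuple(t.size())}"
    assert tuple(t.stride()) == tuple(stride), f"expected stride {stride}, got {tuple(t.stride())}"

x = torch.randn(2, 3, 16, 16).to(memory_format=torch.channels_last)
assert_size_stride_py(x, (2, 3, 16, 16), (768, 1, 48, 3))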