import contextlib
import functools
-from typing import List, Optional
+import itertools
+from typing import Dict, List, Optional

import torch
from torch._dynamo.external_utils import call_backward, call_hook
@@ -195,6 +196,7 @@ def end_capture(self, outputs):
             (self.fx_tracer.create_arg(self.to_proxy(outputs)),),
             {},
         )
+        self.reorder_accumulate_grad_nodes()
         graph = GraphModule(
             self.fx_tracer.root, self.fx_tracer.graph, "CompiledAutograd"
         )
@@ -207,6 +209,24 @@ def end_capture(self, outputs):
         )
         return self.compiler_fn(graph)

+    def reorder_accumulate_grad_nodes(self):
+        """
+        Usage of AOTAutograd causes all the accumulate_grad_ nodes to get pushed to the end of
+        the graph. This differs from eager mode, which schedules them as soon as possible. This
+        pass attempts to reorder the graph to mimic eager behavior.
+        """
+        order: Dict[torch.fx.Node, int] = {}
+        counter = itertools.count()
+        target = torch.ops.inductor.accumulate_grad_.default
+        last = None
+        for node in [*self.fx_tracer.graph.nodes]:
+            if node.op == "call_function" and node.target == target:
+                arg = max(node.args, key=order.get)  # type: ignore[arg-type]
+                if arg is not last:
+                    arg.append(node)
+            order[node] = next(counter)
+            last = node
+
     def to_proxy(self, t):
         if t is None:
             return None
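To illustrate what the new pass does, below is a minimal, self-contained sketch (not part of this commit) that applies the same reordering idea to a toy torch.fx graph. Here demo_accumulate_grad is a hypothetical stand-in for torch.ops.inductor.accumulate_grad_.default, and torch.sin/torch.cos stand in for gradient-producing nodes; the reorder loop mirrors reorder_accumulate_grad_nodes() above.

import itertools
from typing import Dict

import torch
import torch.fx as fx


def demo_accumulate_grad(variable, new_grad):  # hypothetical stand-in op
    pass


# Build a graph where both accumulate nodes sit at the very end,
# roughly how AOTAutograd leaves them.
graph = fx.Graph()
param = graph.placeholder("param")
grad0 = graph.call_function(torch.sin, (param,))
grad1 = graph.call_function(torch.cos, (param,))
acc0 = graph.call_function(demo_accumulate_grad, (param, grad0))
acc1 = graph.call_function(demo_accumulate_grad, (param, grad1))
graph.output(None)
print("before:", [n.name for n in graph.nodes])

# Same idea as reorder_accumulate_grad_nodes(): hoist each accumulate node
# so it directly follows its latest-produced argument.
order: Dict[fx.Node, int] = {}
counter = itertools.count()
last = None
for node in [*graph.nodes]:
    if node.op == "call_function" and node.target is demo_accumulate_grad:
        arg = max(node.args, key=order.get)
        if arg is not last:
            arg.append(node)  # Node.append moves `node` to right after `arg`
    order[node] = next(counter)
    last = node
print("after:", [n.name for n in graph.nodes])

With these assumptions, the second print should show each accumulate node moved from the tail of the graph up next to the gradient it consumes, which is the eager-style scheduling the docstring describes.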