
Commit 252c3b7

more comments
1 parent c689402

3 files changed, +22 -7 lines changed

optimum/fx/parallelization/api.py (+2, -2)

@@ -15,7 +15,7 @@
 import importlib
 import os
 from functools import partial
-from typing import List
+from typing import Callable, List

 import torch
 from torch.fx import GraphModule
@@ -48,7 +48,7 @@ def parallelize_model(
     parallel_ctx: ParallelExecutionCtx,
     *model_args,
     **kwargs,
-):
+) -> Callable:
     """
     API for automatic model parallelism through Pytorch FX.
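For readers skimming the diff, a hypothetical usage sketch of the annotated API follows. Only the names parallelize_model, ParallelExecutionCtx, the *model_args/**kwargs parameters, and the new Callable return annotation come from the hunks above; the import path, the first positional argument, the ParallelExecutionCtx fields, and the example model id are assumptions, not part of this commit.

import torch
import torch.distributed as dist
# Import path assumed from the file layout; not stated in the diff.
from optimum.fx.parallelization import ParallelExecutionCtx, parallelize_model

# Assumed setup: one process per GPU in a tensor-parallel group.
dist.init_process_group("nccl")
device = torch.device("cuda", dist.get_rank())

# Field names below are assumptions; check the library for the real ParallelExecutionCtx signature.
ctx = ParallelExecutionCtx(tp_group=dist.group.WORLD, current_device=device)

# Assumption: the first positional argument identifies the model to parallelize.
# Per this commit, the result is annotated as a Callable, i.e. something you can run directly.
input_ids = torch.randint(0, 1000, (1, 8), device=device)
parallel_model = parallelize_model("gpt2", ctx, input_ids)
outputs = parallel_model(input_ids)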

optimum/fx/parallelization/decomp.py (+18, -3)

@@ -72,7 +72,7 @@ class DecompositionInterpreter(Interpreter):
     DecompositionInterpreter takes the high-level graph module, run the iternal nodes following the topo order, and decompose
     high-level pytorch operators into core aten operators by utilizing torch dispatch infrastructure along the way. Note
     that certain primitive layers(like `nn.Linear`, `nn.Embedding`, and activation layers) are preserved because we have specific
-    heuristic based parallelization strategy for them and we can conveniently replace them into their parallelized counterparts
+    heuristic based parallelization strategy for them so that we can conveniently replace them into their parallelized counterparts
     in the orignal graph module.

     Note that the traced graph is a low-level equivalent representation of the original graph module, and is only used for
@@ -106,7 +106,6 @@ def placeholder(self, target, args, kwargs):
         track_tensor_tree(out, proxy, constant=None, tracer=self.tracer)

         out = pytree.tree_map_only(torch.Tensor, lambda x: to_fun(x), out)
-        # TODO handle case where the first character of target is '*'
         return out

     def call_function(self, target, args, kwargs):
@@ -187,9 +186,25 @@ def run(self, *args, **kwargs):

 def decompose_and_functionalize(
     graph_module: GraphModule,
-    decomposition_table: Dict = core_aten_decompositions(),
+    decomposition_table: Dict[torch._ops.OperatorBase, Callable] = core_aten_decompositions(),
     leaf_function_targets: List[Callable] = [F.scaled_dot_product_attention],
 ) -> Callable:
+    """
+    API to decompose and functionalize a high-level graph module.
+
+    Args:
+        graph_module (GraphModule):
+            The high-level graph module to be decomposed and functionalized.
+        decomposition_table (Dict[torch._ops.OperatorBase, Callable], defaults to `core_aten_decompositions()`):
+            The lookup table which maps high-level torch ops to their equivalent low-level implementations.
+        leaf_function_targets (List[Callable], defaults to `[F.scaled_dot_product_attention]`):
+            Functions which will not be traced through for convenience; `F.scaled_dot_product_attention` is
+            treated as a leaf function by default so that we don't have to deal with all the detailed versions of
+            sdpa in the traced graph.
+
+    Returns:
+        Callable: a wrapper which returns the traced low-level graph when called with concrete arguments.
+    """
     new_graph = Graph(owning_module=graph_module)
     interp = DecompositionInterpreter(graph_module, new_graph, decomposition_table, leaf_function_targets)
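To make the new docstring concrete, here is a hypothetical sketch of how the documented wrapper might be used. The toy module and input are placeholders, and the import path is inferred from the file layout rather than stated in this commit.

import torch
import torch.nn as nn
from torch.fx import symbolic_trace

# Import path inferred from optimum/fx/parallelization/decomp.py; not confirmed by the diff.
from optimum.fx.parallelization.decomp import decompose_and_functionalize

# Placeholder high-level graph module: any symbolically traceable nn.Module would do.
gm = symbolic_trace(nn.Sequential(nn.Linear(8, 8), nn.GELU()))

# Defaults per the signature above: core aten decompositions,
# F.scaled_dot_product_attention kept as a leaf function.
wrapper = decompose_and_functionalize(gm)

# Per the docstring, calling the wrapper with concrete arguments
# returns the traced low-level (decomposed, functionalized) graph.
low_level_graph = wrapper(torch.randn(2, 8))
print(low_level_graph)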

optimum/fx/parallelization/passes.py (+2, -2)

@@ -170,8 +170,8 @@ class ParallelAxisSolverPass(AnalyzeBase):
     - Optimal Solution. Note that since we return the first solution we find, then it might not be optimal in terms of
       memory consumption and communication overhead. But again we can adjust the order of search and try parallelize
       as much as we can first before fall back to non-parallelized search paths. And we don't pay too much attention
-      on calculating communication overhead because in practice they are bounded by number of certain layers in the graph
-      under the constraint that only certain layers are allowed to communicate.
+      on calculating communication overhead because in practice they are bounded under the constraint that only certain
+      layers are allowed to communicate.

     Our goal is not to solve an optimization problem which tries to give a best solution of parallelizing any model under memory/hardware
     constraints, but rather a cheap solution which relieves you from writing boilerplate code for parallelizing layers of different models.
