more comments

zhenglongjiepheonix · zhenglongjiepheonix · commit c689402b5d19 · 2024-08-15T21:49:12.000+02:00
diff --git a/optimum/fx/parallelization/api.py b/optimum/fx/parallelization/api.py
@@ -54,7 +54,8 @@ def parallelize_model(
 
     Args:
         model (str):
-            Model to parallelize, a model id on the Huggingface Hub.
+            Model to parallelize, either a model id on the Hugging Face Hub or a path to a local directory containing the
+            config and weights of the model.
         parallel_ctx (ParallelExecutionCtx):
             Parallel execution context containing process groups the current process belongs to.
         *model_args (Any):
diff --git a/optimum/fx/parallelization/decomp.py b/optimum/fx/parallelization/decomp.py
@@ -68,6 +68,17 @@ def __init__(self, graph: Graph):
 
 
 class DecompositionInterpreter(Interpreter):
+    """
+    DecompositionInterpreter takes the high-level graph module, runs its internal nodes in topological order, and decomposes
+    high-level PyTorch operators into core aten operators by utilizing the torch dispatch infrastructure along the way. Note
+    that certain primitive layers (like `nn.Linear`, `nn.Embedding`, and activation layers) are preserved because we have a
+    specific heuristic-based parallelization strategy for them and we can conveniently replace them with their parallelized
+    counterparts in the original graph module.
+
+    Note that the traced graph is a low-level equivalent representation of the original graph module and is only used for
+    parallel axis propagation and analysis; the original graph module is still used for real execution.
+    """
+
     def __init__(
         self, module: GraphModule, new_graph: Graph, decomposition_table=None, leaf_function_targets=None, **kwargs
     ):