Skip to content

Commit 40880a3

Browse files
fix
1 parent a375b6d commit 40880a3

File tree

2 files changed

+12
-1
lines changed

2 files changed

+12
-1
lines changed

optimum/fx/parallelization/backend/base.py

+5
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,11 @@ def create_parallel_embedding(
183183

184184
return VocabParallelEmbedding(parallel_ctx, mod)
185185

186+
def create_parallel_cross_entropy(
    self, mod_or_fn: Union[nn.CrossEntropyLoss, F.cross_entropy], parallel_ctx: "ParallelExecutionCtx"
):
    """Create a parallelized cross-entropy for this backend.

    Delegates to the parent backend's default implementation.

    Args:
        mod_or_fn: the cross-entropy being parallelized — either an
            ``nn.CrossEntropyLoss`` module or the ``F.cross_entropy`` function.
        parallel_ctx: the parallel execution context shared across the backend.

    Returns:
        Whatever the parent ``create_parallel_cross_entropy`` returns.
    """
    # NOTE(review): ``F.cross_entropy`` is a function, not a type, so this
    # ``Union`` is not a valid static annotation (type checkers will flag it).
    # Runtime is unaffected because ``typing.Union`` accepts any callable;
    # consider ``Union[nn.CrossEntropyLoss, Callable]`` upstream — TODO confirm.
    # ``"ParallelExecutionCtx"`` is quoted for consistency with the sibling
    # ``post_process`` signature in this file.
    return super().create_parallel_cross_entropy(mod_or_fn, parallel_ctx)
190+
186191
def post_process(self, graph_module: GraphModule, ctx: "ParallelExecutionCtx", config: "Config") -> nn.Module:
187192
"""
188193
Initialize or load parameters from checkpoint, and tie them if needed.

optimum/fx/parallelization/backend/nanotron.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,11 @@
1515
# Nanotron specific imports
1616
import importlib.util
1717
from collections import defaultdict
18-
from typing import TYPE_CHECKING, Optional, Tuple
18+
from typing import TYPE_CHECKING, Optional, Tuple, Union
1919

2020
import torch.distributed as dist
2121
import torch.nn as nn
22+
import torch.nn.functional as F
2223
from torch.fx import GraphModule
2324

2425
from ..core import Config, ParallelExecutionCtx, ParameterMeta
@@ -149,6 +150,11 @@ def create_parallel_embedding(
149150
contiguous_chunks=contiguous_chunks,
150151
)
151152

153+
def create_parallel_cross_entropy(
    self, mod_or_fn: Union[nn.CrossEntropyLoss, F.cross_entropy], parallel_ctx: "ParallelExecutionCtx"
):
    """Create a parallelized cross-entropy for the nanotron backend.

    Falls back to the parent backend's default implementation rather than
    using a nanotron-specific construct.

    Args:
        mod_or_fn: the cross-entropy being parallelized — either an
            ``nn.CrossEntropyLoss`` module or the ``F.cross_entropy`` function.
        parallel_ctx: the parallel execution context shared across the backend.

    Returns:
        Whatever the parent ``create_parallel_cross_entropy`` returns.
    """
    # NOTE(review): ``F.cross_entropy`` is a function, not a type, so this
    # ``Union`` is not a valid static annotation (type checkers will flag it).
    # Runtime is unaffected because ``typing.Union`` accepts any callable;
    # consider ``Union[nn.CrossEntropyLoss, Callable]`` upstream — TODO confirm.
    return super().create_parallel_cross_entropy(mod_or_fn, parallel_ctx)
157+
152158
def post_process(
153159
self, graph_module: GraphModule, parallel_ctx: "ParallelExecutionCtx", config: "Config"
154160
) -> nn.Module:

0 commit comments

Comments
 (0)