Refactoring FSDP. (#1586)

AdamLouly · Adam Louly · JingyaHuang · web-flow · commit 5017d0660348 · 2023-12-26T19:13:01.000+01:00
* refactor fsdp

* add trainer

* remove hidden layers

* update dockerfile

---------

Co-authored-by: Adam Louly <adamlouly@microsoft.com@orttrainingdev9.d32nl1ml4oruzj4qz3bqlggovf.px.internal.cloudapp.net>
Co-authored-by: JingyaHuang <huang_jingya@outlook.com>
diff --git a/examples/onnxruntime/training/docker/Dockerfile-ort1.16.3-cu118 b/examples/onnxruntime/training/docker/Dockerfile-ort1.16.3-cu118
@@ -65,12 +65,15 @@ RUN $PYTHON_EXE -m pip install onnx ninja
 RUN $PYTHON_EXE -m pip install torch==${TORCH_VERSION} torchvision==${TORCHVISION_VERSION} -f https://download.pytorch.org/whl/${TORCH_CUDA_VERSION}
 
 # ORT Module
-RUN $PYTHON_EXE -m pip install onnxruntime-training==1.16.1 -f https://download.onnxruntime.ai/onnxruntime_stable_cu118.html
+RUN $PYTHON_EXE -m pip install onnxruntime-training==1.16.3 -f https://download.onnxruntime.ai/onnxruntime_stable_cu118.html
 RUN $PYTHON_EXE -m pip install torch-ort
 ENV TORCH_CUDA_ARCH_LIST="5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX"
 RUN $PYTHON_EXE -m pip install --upgrade protobuf==3.20.2
 RUN $PYTHON_EXE -m torch_ort.configure
 
+# Workaround: uninstall nvidia-nccl-cu12, see https://github.com/vllm-project/vllm/issues/1726
+RUN pip uninstall nvidia-nccl-cu12 -y
+
 WORKDIR .
 
 CMD ["/bin/bash"]
diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py
@@ -455,7 +455,7 @@ def _inner_training_loop(
             else:
                 debug_overflow = DebugUnderflowOverflow(self.model)  # noqa
 
-        delay_optimizer_creation = is_sagemaker_mp_enabled() or self.fsdp is not None or self.is_fsdp_enabled
+        delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled
 
         # Wrap the model with `ORTModule`
         logger.info("Wrap ORTModule for ONNX Runtime training.")
@@ -883,7 +883,7 @@ def _wrap_model(self, model, training=True, dataloader=None):
             return model
 
         # Distributed training using PyTorch FSDP
-        if self.fsdp is not None:
+        if self.is_fsdp_xla_enabled:
             try:
                 from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP
                 from torch_xla.distributed.fsdp import checkpoint_module
diff --git a/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer b/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer
@@ -65,12 +65,15 @@ RUN $PYTHON_EXE -m pip install onnx ninja
 RUN $PYTHON_EXE -m pip install torch==${TORCH_VERSION} torchvision==${TORCHVISION_VERSION} -f https://download.pytorch.org/whl/${TORCH_CUDA_VERSION}
 
 # ORT Module
-RUN $PYTHON_EXE -m pip install onnxruntime-training==1.16.1 -f https://download.onnxruntime.ai/onnxruntime_stable_cu118.html
+RUN $PYTHON_EXE -m pip install onnxruntime-training==1.16.3 -f https://download.onnxruntime.ai/onnxruntime_stable_cu118.html
 RUN $PYTHON_EXE -m pip install torch-ort
 ENV TORCH_CUDA_ARCH_LIST="5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX"
 RUN $PYTHON_EXE -m pip install --upgrade protobuf==3.20.2
 RUN $PYTHON_EXE -m torch_ort.configure
 
+# Workaround: uninstall nvidia-nccl-cu12, see https://github.com/vllm-project/vllm/issues/1726
+RUN pip uninstall nvidia-nccl-cu12 -y
+
 # Install Optimum
 COPY . /workspace/optimum
 RUN pip install /workspace/optimum[tests]