Deepspeed Support (#188)

Tyler Titsworth · sharvil10 · web-flow · commit 431abb6e4272 · 2024-07-08T15:58:20.000-07:00
Signed-off-by: tylertitsworth <tyler.titsworth@intel.com>
Signed-off-by: Tyler Titsworth <tyler.titsworth@intel.com>
Co-authored-by: Sharvil Shah <sharvil.shah@intel.com>
diff --git a/docs/scripts/matrix.py b/docs/scripts/matrix.py
@@ -76,7 +76,7 @@ def get_dependency_string(dep_type):
             py_reqs = re.sub(r"\n-(.*)", "", f.read())
             py_reqs = re.sub(r"(.*]?)(\W=)(.*)", r"\1 \3", py_reqs)
             py_reqs = re.sub(r"#(.*)", "", py_reqs)
-            py_deps = py_deps + "\n".join(py_reqs.split("\n"))
+            py_deps = py_deps + "\n" + "\n".join(py_reqs.split("\n"))
 
     return os_deps, py_deps, conda_deps
 
diff --git a/pytorch/Dockerfile b/pytorch/Dockerfile
@@ -44,7 +44,8 @@ ARG TORCHVISION_VERSION
 WORKDIR /
 COPY requirements.txt .
 
-RUN python -m pip install --no-cache-dir -r requirements.txt
+RUN python -m pip install --no-cache-dir -r requirements.txt && \
+    rm -rf requirements.txt
 
 FROM ${PYTHON_BASE} AS ipex-base-idp
 
@@ -64,7 +65,8 @@ FROM ipex-base-${PACKAGE_OPTION} AS jupyter
 WORKDIR /jupyter
 COPY jupyter-requirements.txt .
 
-RUN python -m pip install --no-cache-dir -r jupyter-requirements.txt
+RUN python -m pip install --no-cache-dir -r jupyter-requirements.txt && \
+    rm -rf jupyter-requirements.txt
 
 RUN mkdir -p /jupyter/ && chmod -R a+rwx /jupyter/
 RUN mkdir /.local && chmod a+rwx /.local
@@ -78,8 +80,15 @@ FROM ipex-base-${PACKAGE_OPTION} AS multinode
+# OS packages for the multinode stage; g++, libopenmpi-dev and numactl are
+# added alongside DeepSpeed support. The apt package lists are removed in the
+# same layer so they do not persist in the image.
 RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
     python3-dev \
     gcc \
+    g++ \
     libgl1-mesa-glx \
     libglib2.0-0 \
-    virtualenv
+    libopenmpi-dev \
+    numactl \
+    virtualenv && \
+    rm -rf /var/lib/apt/lists/*
 
 ENV SIGOPT_PROJECT=.
@@ -88,6 +93,10 @@ WORKDIR /
 COPY multinode/requirements.txt requirements.txt
 
+# Install the multinode requirements, build DeepSpeed with DS_BUILD_OPS=1, and
+# remove nvidia-ml-py non-interactively (-y) instead of piping "Y" to a prompt.
 RUN python -m pip install --no-cache-dir -r requirements.txt && \
+    DS_BUILD_OPS=1 python -m pip install --no-cache-dir deepspeed==0.14.4 && \
+    python -m pip uninstall -y nvidia-ml-py && \
     rm -rf requirements.txt
 
 ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/lib"
@@ -109,7 +116,10 @@ COPY multinode/generate_ssh_keys.sh /generate_ssh_keys.sh
 # modify generate_ssh_keys to be a helper script
 # print how to use helper script on bash startup
 # Avoids loop for further execution of the startup file
-RUN echo "source /usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/env/setvars.sh" >> ~/.startup && \
+ARG PACKAGE_OPTION=pip
+ARG PYPATH="/usr/local/lib/python${PYTHON_VERSION}/dist-packages"
+RUN if [ "${PACKAGE_OPTION}" = "idp" ]; then PYPATH="/opt/conda/envs/idp/lib/python${PYTHON_VERSION}/site-packages"; fi && \
+    echo "source ${PYPATH}/oneccl_bindings_for_pytorch/env/setvars.sh" >> ~/.startup && \
     cat '/generate_ssh_keys.sh' >> ~/.startup && \
     rm -rf /generate_ssh_keys.sh
 
@@ -124,7 +134,6 @@ RUN wget -q  --no-check-certificate https://raw.githubusercontent.com/oneapi-src
     wget -q  --no-check-certificate https://raw.githubusercontent.com/intel/neural-compressor/master/LICENSE -O /licensing/LICENSE
 
 ENTRYPOINT ["/usr/local/bin/dockerd-entrypoint.sh"]
-CMD ["bash"]
 
 FROM ${PYTHON_BASE} AS ipex-xpu-base
 
diff --git a/pytorch/README.md b/pytorch/README.md
@@ -237,7 +237,10 @@ To add these files correctly please follow the steps described below.
         ```
 
 > [!NOTE]
-> [Intel MPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html) can be configured based on your machine settings. If the above commands do not work for you, see the documentation for how to configure based on your network.
+> [Intel® MPI] can be configured based on your machine settings. If the above commands do not work for you, see the documentation for how to configure based on your network.
+
+> [!TIP]
+> Additionally, [DeepSpeed*] optimizations can be utilized in place of ipexrun with the `ccl` backend for multi-node training.
 
 ---
 
@@ -331,12 +334,14 @@ It is the image user's responsibility to ensure that any use of The images below
 [Intel® Data Center GPU Flex Series]: https://ark.intel.com/content/www/us/en/ark/products/series/230021/intel-data-center-gpu-flex-series.html
 [Intel® Data Center GPU Max Series]: https://ark.intel.com/content/www/us/en/ark/products/series/232874/intel-data-center-gpu-max-series.html
 
+[Intel® MPI]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html
 [Intel® Extension for PyTorch*]: https://intel.github.io/intel-extension-for-pytorch/
 [Intel® Distribution for Python*]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/distribution-for-python.html
 [Intel® oneAPI Collective Communications Library]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/oneccl.html
 [INC]: https://github.com/intel/neural-compressor
 [PyTorch*]: https://pytorch.org/
 [TorchServe*]: https://github.com/pytorch/serve
+[DeepSpeed*]: https://github.com/microsoft/DeepSpeed
 
 [v0.4.0-Beta]: https://github.com/intel/ai-containers/blob/main/pytorch/Dockerfile
 [v0.3.4]: https://github.com/intel/ai-containers/blob/v0.3.4/pytorch/Dockerfile
diff --git a/pytorch/docker-compose.yaml b/pytorch/docker-compose.yaml
@@ -77,18 +77,20 @@ services:
         dependency.apt.libglib2: true
         dependency.apt.python3-dev: true
         dependency.pip.apt.virtualenv: true
+        dependency.pip.deepspeed: 0.14.4
         dependency.python.pip: multinode/requirements.txt
         org.opencontainers.base.name: "intel/intel-optimized-pytorch:${IPEX_VERSION:-2.2.0}-${PACKAGE_OPTION:-pip}-base"
         org.opencontainers.image.title: "Intel® Extension for PyTorch MultiNode Image"
         org.opencontainers.image.version: ${IPEX_VERSION:-2.2.0}-${PACKAGE_OPTION:-pip}-multinode
       target: multinode
     command: >
-      sh -c "python -c 'import neural_compressor;import
-      oneccl_bindings_for_pytorch as oneccl; print(\"Neural Compressor
-      Version:\", neural_compressor.__version__, \"\\nOneCCL:\",
-      oneccl.__version__)'"
+      bash -c "python -c 'import neural_compressor;import oneccl_bindings_for_pytorch as oneccl;import deepspeed;
+      print(\"Neural Compressor:\", neural_compressor.__version__,
+      \"\\nOneCCL:\", oneccl.__version__,
+      \"\\nDeepspeed:\", deepspeed.__version__)'"
     extends: ipex-base
     image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6}
+    shm_size: 2gb
   xpu:
     build:
       args:
diff --git a/pytorch/multinode/requirements.txt b/pytorch/multinode/requirements.txt
@@ -1,3 +1,5 @@
-oneccl_bind_pt==2.3.0+cpu
--f https://developer.intel.com/ipex-whl-stable-cpu
 neural-compressor==2.6
+oneccl_bind_pt==2.3.0+cpu
+--extra-index-url https://developer.intel.com/ipex-whl-stable-cpu
+oneccl-devel>=2021.13.0 # required to build deepspeed ops
+mpi4py>=3.1.0 # required to build deepspeed ops
diff --git a/pytorch/tests/ipex-resnet50.py b/pytorch/tests/ipex-resnet50.py
@@ -30,19 +30,27 @@
 parser.add_argument("--device", default="cpu", choices=["cpu", "xpu"])
 parser.add_argument("--ipex", action="store_true")
 parser.add_argument("--backend", default="gloo", choices=["gloo", "ccl"])
+parser.add_argument("--deepspeed", action="store_true")
 args = parser.parse_args()
 
 try:
     import oneccl_bindings_for_pytorch
 except:
     pass
 
-dist.init_process_group(
-    backend=args.backend,
-    init_method=init_method,
-    world_size=int(os.environ.get("WORLD_SIZE")),
-    rank=int(os.environ.get("RANK")),
-)
+if args.deepspeed:
+    import deepspeed
+
+    deepspeed.init_distributed(
+        deepspeed.accelerator.get_accelerator().communication_backend_name()
+    )
+else:
+    dist.init_process_group(
+        backend=args.backend,
+        init_method=init_method,
+        world_size=int(os.environ.get("WORLD_SIZE")),
+        rank=int(os.environ.get("RANK")),
+    )
 
 model = models.resnet50(pretrained=False)
 
diff --git a/pytorch/tests/tests.yaml b/pytorch/tests/tests.yaml
@@ -26,10 +26,14 @@ import-xpu-jupyter-${PACKAGE_OPTION:-pip}:
   cmd: python -m jupyter --version
 import-cpu-oneccl-${PACKAGE_OPTION:-pip}:
   img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6}
-  cmd: python -c "import oneccl_bindings_for_pytorch as oneccl; print(f'oneccl {oneccl.__version__}')"
+  cmd: python -c "'import oneccl_bindings_for_pytorch as oneccl;print(oneccl.__version__)'"
 import-cpu-inc-${PACKAGE_OPTION:-pip}:
   img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6}
-  cmd: python -c "import neural_compressor as inc;print(inc.__version__)"
+  cmd: python -c "'import neural_compressor as inc;print(inc.__version__)'"
+import-cpu-deepspeed-${PACKAGE_OPTION:-pip}:
+  img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6}
+  cmd: ds_report
+  shm_size: 2gb
 ipex-cpu-${PACKAGE_OPTION:-pip}:
   img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-base
   cmd: python /tests/ipex-resnet50.py --ipex --device cpu --backend gloo
@@ -49,6 +53,14 @@ ipex-xpu-jupyter-${PACKAGE_OPTION:-pip}:
 oneccl-${PACKAGE_OPTION:-pip}:
   img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6}
   cmd: ipexrun cpu /tests/ipex-resnet50.py --ipex --device cpu --backend ccl
+  privileged: true
+  volumes:
+  - dst: /tests
+    src: $PWD/pytorch/tests
+oneccl-ds-${PACKAGE_OPTION:-pip}:
+  img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6}
+  cmd: ipexrun cpu /tests/ipex-resnet50.py --ipex --device cpu --backend ccl --deepspeed
+  privileged: true
   volumes:
   - dst: /tests
     src: $PWD/pytorch/tests