Skip to content

Commit 431abb6

Browse files
Tyler Titsworth and sharvil10 authored
Deepspeed Support (#188)
Signed-off-by: tylertitsworth <tyler.titsworth@intel.com> Signed-off-by: Tyler Titsworth <tyler.titsworth@intel.com> Co-authored-by: Sharvil Shah <sharvil.shah@intel.com>
1 parent 51a8b0c commit 431abb6

File tree

7 files changed

+58
-20
lines changed

7 files changed

+58
-20
lines changed

docs/scripts/matrix.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def get_dependency_string(dep_type):
7676
py_reqs = re.sub(r"\n-(.*)", "", f.read())
7777
py_reqs = re.sub(r"(.*]?)(\W=)(.*)", r"\1 \3", py_reqs)
7878
py_reqs = re.sub(r"#(.*)", "", py_reqs)
79-
py_deps = py_deps + "\n".join(py_reqs.split("\n"))
79+
py_deps = py_deps + "\n" + "\n".join(py_reqs.split("\n"))
8080

8181
return os_deps, py_deps, conda_deps
8282

pytorch/Dockerfile

+13-4
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,8 @@ ARG TORCHVISION_VERSION
4444
WORKDIR /
4545
COPY requirements.txt .
4646

47-
RUN python -m pip install --no-cache-dir -r requirements.txt
47+
RUN python -m pip install --no-cache-dir -r requirements.txt && \
48+
rm -rf requirements.txt
4849

4950
FROM ${PYTHON_BASE} AS ipex-base-idp
5051

@@ -64,7 +65,8 @@ FROM ipex-base-${PACKAGE_OPTION} AS jupyter
6465
WORKDIR /jupyter
6566
COPY jupyter-requirements.txt .
6667

67-
RUN python -m pip install --no-cache-dir -r jupyter-requirements.txt
68+
RUN python -m pip install --no-cache-dir -r jupyter-requirements.txt && \
69+
rm -rf jupyter-requirements.txt
6870

6971
RUN mkdir -p /jupyter/ && chmod -R a+rwx /jupyter/
7072
RUN mkdir /.local && chmod a+rwx /.local
@@ -78,8 +80,11 @@ FROM ipex-base-${PACKAGE_OPTION} AS multinode
7880
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
7981
python3-dev \
8082
gcc \
83+
g++ \
8184
libgl1-mesa-glx \
8285
libglib2.0-0 \
86+
libopenmpi-dev \
87+
numactl \
8388
virtualenv
8489

8590
ENV SIGOPT_PROJECT=.
@@ -88,6 +93,8 @@ WORKDIR /
8893
COPY multinode/requirements.txt requirements.txt
8994

9095
RUN python -m pip install --no-cache-dir -r requirements.txt && \
96+
DS_BUILD_OPS=1 python -m pip install --no-cache-dir deepspeed==0.14.4 && \
97+
echo "Y" | pip uninstall nvidia-ml-py && \
9198
rm -rf requirements.txt
9299

93100
ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/lib"
@@ -109,7 +116,10 @@ COPY multinode/generate_ssh_keys.sh /generate_ssh_keys.sh
109116
# modify generate_ssh_keys to be a helper script
110117
# print how to use helper script on bash startup
111118
# Avoids loop for further execution of the startup file
112-
RUN echo "source /usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/env/setvars.sh" >> ~/.startup && \
119+
ARG PACKAGE_OPTION=pip
120+
ARG PYPATH="/usr/local/lib/python${PYTHON_VERSION}/dist-packages"
121+
RUN if [ "${PACKAGE_OPTION}" = "idp" ]; then PYPATH="/opt/conda/envs/idp/lib/python${PYTHON_VERSION}/site-packages"; fi && \
122+
echo "source ${PYPATH}/oneccl_bindings_for_pytorch/env/setvars.sh" >> ~/.startup && \
113123
cat '/generate_ssh_keys.sh' >> ~/.startup && \
114124
rm -rf /generate_ssh_keys.sh
115125

@@ -124,7 +134,6 @@ RUN wget -q --no-check-certificate https://raw.githubusercontent.com/oneapi-src
124134
wget -q --no-check-certificate https://raw.githubusercontent.com/intel/neural-compressor/master/LICENSE -O /licensing/LICENSE
125135

126136
ENTRYPOINT ["/usr/local/bin/dockerd-entrypoint.sh"]
127-
CMD ["bash"]
128137

129138
FROM ${PYTHON_BASE} AS ipex-xpu-base
130139

pytorch/README.md

+6-1
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,10 @@ To add these files correctly please follow the steps described below.
237237
```
238238
239239
> [!NOTE]
240-
> [Intel MPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html) can be configured based on your machine settings. If the above commands do not work for you, see the documentation for how to configure based on your network.
240+
> [Intel® MPI] can be configured based on your machine settings. If the above commands do not work for you, see the documentation for how to configure based on your network.
241+
242+
> [!TIP]
243+
> Additionally, [DeepSpeed*] optimizations can be utilized in place of ipexrun with the `ccl` backend for multi-node training.
241244
242245
---
243246
@@ -331,12 +334,14 @@ It is the image user's responsibility to ensure that any use of The images below
331334
[Intel® Data Center GPU Flex Series]: https://ark.intel.com/content/www/us/en/ark/products/series/230021/intel-data-center-gpu-flex-series.html
332335
[Intel® Data Center GPU Max Series]: https://ark.intel.com/content/www/us/en/ark/products/series/232874/intel-data-center-gpu-max-series.html
333336

337+
[Intel® MPI]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html
334338
[Intel® Extension for PyTorch*]: https://intel.github.io/intel-extension-for-pytorch/
335339
[Intel® Distribution for Python*]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/distribution-for-python.html
336340
[Intel® oneAPI Collective Communications Library]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/oneccl.html
337341
[INC]: https://github.com/intel/neural-compressor
338342
[PyTorch*]: https://pytorch.org/
339343
[TorchServe*]: https://github.com/pytorch/serve
344+
[DeepSpeed*]: https://github.com/microsoft/DeepSpeed
340345

341346
[v0.4.0-Beta]: https://github.com/intel/ai-containers/blob/main/pytorch/Dockerfile
342347
[v0.3.4]: https://github.com/intel/ai-containers/blob/v0.3.4/pytorch/Dockerfile

pytorch/docker-compose.yaml

+6-4
Original file line numberDiff line numberDiff line change
@@ -77,18 +77,20 @@ services:
7777
dependency.apt.libglib2: true
7878
dependency.apt.python3-dev: true
7979
dependency.pip.apt.virtualenv: true
80+
dependency.pip.deepspeed: 0.14.4
8081
dependency.python.pip: multinode/requirements.txt
8182
org.opencontainers.base.name: "intel/intel-optimized-pytorch:${IPEX_VERSION:-2.2.0}-${PACKAGE_OPTION:-pip}-base"
8283
org.opencontainers.image.title: "Intel® Extension for PyTorch MultiNode Image"
8384
org.opencontainers.image.version: ${IPEX_VERSION:-2.2.0}-${PACKAGE_OPTION:-pip}-multinode
8485
target: multinode
8586
command: >
86-
sh -c "python -c 'import neural_compressor;import
87-
oneccl_bindings_for_pytorch as oneccl; print(\"Neural Compressor
88-
Version:\", neural_compressor.__version__, \"\\nOneCCL:\",
89-
oneccl.__version__)'"
87+
bash -c "python -c 'import neural_compressor;import oneccl_bindings_for_pytorch as oneccl;import deepspeed;
88+
print(\"Neural Compressor:\", neural_compressor.__version__,
89+
\"\\nOneCCL:\", oneccl.__version__,
90+
\"\\nDeepspeed:\", deepspeed.__version__)'"
9091
extends: ipex-base
9192
image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6}
93+
shm_size: 2gb
9294
xpu:
9395
build:
9496
args:

pytorch/multinode/requirements.txt

+4-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1-
oneccl_bind_pt==2.3.0+cpu
2-
-f https://developer.intel.com/ipex-whl-stable-cpu
31
neural-compressor==2.6
2+
oneccl_bind_pt==2.3.0+cpu
3+
--extra-index-url https://developer.intel.com/ipex-whl-stable-cpu
4+
oneccl-devel>=2021.13.0 # required to build deepspeed ops
5+
mpi4py>=3.1.0 # required to build deepspeed ops

pytorch/tests/ipex-resnet50.py

+14-6
Original file line numberDiff line numberDiff line change
@@ -30,19 +30,27 @@
3030
parser.add_argument("--device", default="cpu", choices=["cpu", "xpu"])
3131
parser.add_argument("--ipex", action="store_true")
3232
parser.add_argument("--backend", default="gloo", choices=["gloo", "ccl"])
33+
parser.add_argument("--deepspeed", action="store_true")
3334
args = parser.parse_args()
3435

3536
try:
3637
import oneccl_bindings_for_pytorch
3738
except:
3839
pass
3940

40-
dist.init_process_group(
41-
backend=args.backend,
42-
init_method=init_method,
43-
world_size=int(os.environ.get("WORLD_SIZE")),
44-
rank=int(os.environ.get("RANK")),
45-
)
41+
if args.deepspeed:
42+
import deepspeed
43+
44+
deepspeed.init_distributed(
45+
deepspeed.accelerator.get_accelerator().communication_backend_name()
46+
)
47+
else:
48+
dist.init_process_group(
49+
backend=args.backend,
50+
init_method=init_method,
51+
world_size=int(os.environ.get("WORLD_SIZE")),
52+
rank=int(os.environ.get("RANK")),
53+
)
4654

4755
model = models.resnet50(pretrained=False)
4856

pytorch/tests/tests.yaml

+14-2
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,14 @@ import-xpu-jupyter-${PACKAGE_OPTION:-pip}:
2626
cmd: python -m jupyter --version
2727
import-cpu-oneccl-${PACKAGE_OPTION:-pip}:
2828
img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6}
29-
cmd: python -c "import oneccl_bindings_for_pytorch as oneccl; print(f'oneccl {oneccl.__version__}')"
29+
cmd: python -c "'import oneccl_bindings_for_pytorch as oneccl;print(oneccl.__version__)'"
3030
import-cpu-inc-${PACKAGE_OPTION:-pip}:
3131
img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6}
32-
cmd: python -c "import neural_compressor as inc;print(inc.__version__)"
32+
cmd: python -c "'import neural_compressor as inc;print(inc.__version__)'"
33+
import-cpu-deepspeed-${PACKAGE_OPTION:-pip}:
34+
img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6}
35+
cmd: ds_report
36+
shm_size: 2gb
3337
ipex-cpu-${PACKAGE_OPTION:-pip}:
3438
img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-base
3539
cmd: python /tests/ipex-resnet50.py --ipex --device cpu --backend gloo
@@ -49,6 +53,14 @@ ipex-xpu-jupyter-${PACKAGE_OPTION:-pip}:
4953
oneccl-${PACKAGE_OPTION:-pip}:
5054
img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6}
5155
cmd: ipexrun cpu /tests/ipex-resnet50.py --ipex --device cpu --backend ccl
56+
privileged: true
57+
volumes:
58+
- dst: /tests
59+
src: $PWD/pytorch/tests
60+
oneccl-ds-${PACKAGE_OPTION:-pip}:
61+
img: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-${BASE_IMAGE_NAME:-ubuntu}-${BASE_IMAGE_TAG:-22.04}-${PACKAGE_OPTION:-pip}-py${PYTHON_VERSION:-3.10}-ipex-${IPEX_VERSION:-2.3.0}-oneccl-inc-${INC_VERSION:-2.6}
62+
cmd: ipexrun cpu /tests/ipex-resnet50.py --ipex --device cpu --backend ccl --deepspeed
63+
privileged: true
5264
volumes:
5365
- dst: /tests
5466
src: $PWD/pytorch/tests

0 commit comments

Comments
 (0)