Skip to content

Commit 6006b33

Browse files
Tyler Titsworthsharvil.shahsharvil10jitendra42
authored
IPEX Multinode SSH Support (#124)
Signed-off-by: Tyler Titsworth <tyler.titsworth@intel.com> Co-authored-by: sharvil.shah <sharvils@mlp-prod-clx-5669.ra.intel.com> Co-authored-by: sharvil10 <sharvil.shah@intel.com> Co-authored-by: Jitendra Patil <jitendra.patil@intel.com>
1 parent a0d7120 commit 6006b33

File tree

3 files changed

+170
-12
lines changed

3 files changed

+170
-12
lines changed

pytorch/Dockerfile

+48-11
Original file line numberDiff line numberDiff line change
@@ -80,9 +80,7 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missin
8080
gcc \
8181
libgl1-mesa-glx \
8282
libglib2.0-0 \
83-
virtualenv && \
84-
apt-get clean && \
85-
rm -rf /var/lib/apt/lists/*
83+
virtualenv
8684

8785
ENV SIGOPT_PROJECT=.
8886

@@ -91,24 +89,63 @@ COPY multinode-requirements.txt .
9189

9290
RUN python -m pip install --no-cache-dir -r multinode-requirements.txt
9391

92+
# PYTHON_VERSION is declared here so the interpolation below does not depend on a
# declaration elsewhere in the stage. A build arg that is not in scope expands to an
# empty string, which would produce ".../python/dist-packages/..." paths.
# Redeclaring an ARG that is already in scope is harmless.
ARG PYTHON_VERSION
# Search order: system libraries first, then any inherited LD_LIBRARY_PATH, then the
# libfabric and oneCCL libraries located under the oneccl_bindings_for_pytorch package.
ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/lib"
93+
94+
# Install the OpenSSH client and server.
# The package index is refreshed in this same layer so the install never relies on
# (possibly stale or absent) lists left behind by an earlier layer.
# Any host keys present after the install are removed so that no host key pair is
# baked into the image; `rm -f` keeps the build from failing when a glob matches nothing.
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends --fix-missing \
    openssh-client \
    openssh-server && \
    rm -f /etc/ssh/ssh_host_*_key \
    /etc/ssh/ssh_host_*_key.pub && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
101+
102+
# Allow OpenSSH to talk to containers without asking for confirmation
103+
# hadolint global ignore=SC2002
104+
# Create the sshd runtime directory, then rebuild the client-side ssh_config:
# drop every existing StrictHostKeyChecking line, append a single
# "StrictHostKeyChecking no" entry, and swap the new file into place.
RUN mkdir -p /var/run/sshd && \
    grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
108+
94109
ARG PYTHON_VERSION
95-
RUN echo "source /usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/env/setvars.sh" >> ~/.bashrc
96110

111+
COPY generate_ssh_keys.sh .
112+
113+
# modify generate_ssh_keys to be a helper script
114+
# print how to use helper script on bash startup
115+
# Avoids loop for further execution of the startup file
116+
# Assemble ~/.startup: first a line that sources the oneCCL bindings' setvars.sh,
# then the contents of generate_ssh_keys.sh. The copied script is deleted afterwards
# because its contents now live in ~/.startup.
RUN printf '%s\n' "source /usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/env/setvars.sh" >> ~/.startup && \
    cat /generate_ssh_keys.sh >> ~/.startup && \
    rm -rf /generate_ssh_keys.sh
97119

98-
ENV I_MPI_ROOT="${I_MPI_ROOT}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch"
99-
ENV CCL_ROOT="${CCL_ROOT}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch"
100-
ENV FI_PROVIDER_PATH="${FI_PROVIDER_PATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib/prov"
101-
ENV LIBRARY_PATH="${LIBRARY_PATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/lib"
102-
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/lib"
103-
ENV PATH="${PATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/bin"
104-
ENV CPATH="${CPATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/include"
120+
# hadolint global ignore=SC3037
121+
# Generate the container entrypoint script.
# printf is used instead of `echo -e` because `echo -e` is not portable: under a
# POSIX /bin/sh such as dash the "-e" is printed literally and would corrupt the
# shebang line. The file is written with ">" so the script content is exactly the
# lines below, regardless of anything already at that path.
#
# The generated script:
#   - aborts on the first failing command (set -e)
#   - sources ~/.startup with allexport on, so every variable it sets is exported
#   - evaluates the container command passed as arguments
#   - then blocks on `tail -f /dev/null`, keeping the container alive after the
#     command returns
RUN printf '%s\n' \
    '#!/bin/bash' \
    'set -e' \
    'set -a' \
    'source ~/.startup' \
    'set +a' \
    'eval "$@"' \
    'tail -f /dev/null' > /usr/local/bin/dockerd-entrypoint.sh && \
    chmod +x /usr/local/bin/dockerd-entrypoint.sh
129+
130+
# Write the sshd configuration used when the server is started with
# `-f /var/run/sshd_config`.
# All directives are emitted by a single printf and redirected once. Redirecting
# each line separately with ">" truncates the file every time and leaves only the
# last directive in place.
# AuthorizedKeysFile lists both /etc/ssh/authorized_keys and the per-user
# .ssh/authorized_keys (relative to the user's home directory), so keys mounted
# at either location are accepted.
RUN printf '%s\n' \
    'HostKey /etc/ssh/ssh_host_dsa_key' \
    'HostKey /etc/ssh/ssh_host_rsa_key' \
    'HostKey /etc/ssh/ssh_host_ecdsa_key' \
    'HostKey /etc/ssh/ssh_host_ed25519_key' \
    'AuthorizedKeysFile /etc/ssh/authorized_keys .ssh/authorized_keys' \
    '## Enable DEBUG log. You can ignore this but this may help you debug any issue while enabling SSHD for the first time' \
    'LogLevel DEBUG3' \
    'UsePAM yes' \
    'Subsystem sftp /usr/lib/openssh/sftp-server' \
    > /var/run/sshd_config
105139

106140
RUN mkdir -p /licensing
107141

108142
# Download the third-party notices and the license text into /licensing.
# NOTE(review): --no-check-certificate disables TLS verification, and two of the
# URLs track the moving `master` branch rather than a pinned commit - confirm both
# are intentional.
RUN wget --no-check-certificate -q \
      -O /licensing/oneccl_third_party_programs.txt \
      https://raw.githubusercontent.com/oneapi-src/oneCCL/b7d66de16e17f88caffd7c6df4cd5e12b266af84/third-party-programs.txt && \
    wget --no-check-certificate -q \
      -O /licensing/third-party-programs-pytorch.txt \
      https://raw.githubusercontent.com/intel/neural-compressor/master/docker/third-party-programs-pytorch.txt && \
    wget --no-check-certificate -q \
      -O /licensing/LICENSE \
      https://raw.githubusercontent.com/intel/neural-compressor/master/LICENSE
111145

146+
ENTRYPOINT ["/usr/local/bin/dockerd-entrypoint.sh"]
147+
CMD ["bash"]
148+
112149
FROM ${PYTHON_BASE} AS ipex-xpu-base
113150

114151
RUN apt-get update && \

pytorch/README.md

+94-1
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ docker run -it --rm \
9797
--net=host \
9898
-v $PWD/workspace:/workspace \
9999
-w /workspace \
100-
intel/intel-extension-for-tensorflow:xpu-jupyter
100+
intel/intel-extension-for-pytorch:xpu-jupyter
101101
```
102102

103103
After running the command above, copy the URL (something like `http://127.0.0.1:$PORT/?token=***`) into your browser to access the notebook server.
@@ -113,6 +113,99 @@ The images below additionally include [Intel® oneAPI Collective Communications
113113
| `2.1.0-pip-mulitnode` | [v2.1.0] | [v2.1.0+cpu] | [v2.1.0][ccl-v2.1.0] | [v2.3.1] | [v0.2.3] |
114114
| `2.0.0-pip-multinode` | [v2.0.0] | [v2.0.0+cpu] | [v2.0.0][ccl-v2.0.0] | [v2.1.1] | [v0.1.0] |
115115

116+
> **Note:** Passwordless SSH connection is also enabled in the image.
117+
> The container does not contain the SSH ID keys. The user needs to mount those keys at `/root/.ssh/id_rsa` and `/root/.ssh/id_rsa.pub`.
118+
> Users also need to append the contents of `id_rsa.pub` to `/etc/ssh/authorized_keys` in the SSH server container.
119+
> Since the SSH key is not owned by the default user account in Docker, please also run "chmod 644 id_rsa.pub; chmod 644 id_rsa" to grant read access to the default user account.
120+
> Users could also use "/usr/bin/ssh-keygen -t rsa -b 4096 -N '' -f ~/mnt/ssh_key/id_rsa" to generate a new SSH Key inside the container.
121+
> Users need to mount a config file to list all hostnames at location `/root/.ssh/config` on the SSH client container.
122+
> Once all files are added, the containers can be started as described in the steps below.
123+
124+
#### Setup and Run IPEX Multi-Node Container
125+
126+
Some additional assembly is required to utilize this container with OpenSSH. To perform any kind of DDP (Distributed Data Parallel) execution, containers are assigned the roles of launcher and worker respectively:
127+
128+
SSH Server (Worker)
129+
130+
1. *Authorized Keys* : `/etc/ssh/authorized_keys`
131+
132+
SSH Client (Launcher)
133+
134+
1. *Config File with Host IPs* : `/root/.ssh/config`
135+
2. *Private User Key* : `/root/.ssh/id_rsa`
136+
137+
To add these files correctly please follow the steps described below.
138+
139+
1. Setup ID Keys
140+
141+
You can use the commands provided below to [generate the Identity keys](https://www.ssh.com/academy/ssh/keygen#creating-an-ssh-key-pair-for-user-authentication) for OpenSSH.
142+
143+
```bash
144+
ssh-keygen -q -N "" -t rsa -b 4096 -f ./id_rsa
145+
touch authorized_keys
146+
cat id_rsa.pub >> authorized_keys
147+
```
148+
149+
2. Add hosts to config
150+
151+
The launcher container needs to have a config file with all hostnames and ports specified. An example of a config file is provided below.
152+
153+
```bash
154+
touch config
155+
```
156+
157+
```txt
158+
Host host1
159+
HostName <Hostname of host1>
160+
IdentitiesOnly yes
161+
Port <SSH Port>
162+
Host host2
163+
HostName <Hostname of host2>
164+
IdentitiesOnly yes
165+
Port <SSH Port>
166+
...
167+
```
168+
169+
3. Configure the permissions and ownership for all of the files you have created so far.
170+
171+
```bash
172+
chmod 600 id_rsa.pub id_rsa config authorized_keys
173+
chown root:root id_rsa.pub id_rsa config authorized_keys
174+
```
175+
176+
4. Now start the workers and execute DDP on the launcher.
177+
178+
1. Worker run command:
179+
180+
```bash
181+
export SSH_PORT=<SSH Port>
182+
docker run -it --rm \
183+
--net=host \
184+
-v $PWD/authorized_keys:/root/.ssh/authorized_keys \
185+
-v $PWD/tests:/workspace/tests \
186+
-w /workspace \
187+
-e SSH_PORT=${SSH_PORT} \
188+
intel/intel-extension-for-pytorch:2.3.0-pip-multinode \
189+
bash -c '/usr/sbin/sshd -D -p ${SSH_PORT} -f /var/run/sshd_config'
190+
```
191+
192+
2. Launcher run command:
193+
194+
```bash
195+
docker run -it --rm \
196+
--net=host \
197+
-v $PWD/id_rsa:/root/.ssh/id_rsa \
198+
-v $PWD/config:/root/.ssh/config \
199+
-v $PWD/tests:/workspace/tests \
200+
-w /workspace \
201+
-e SSH_PORT=${SSH_PORT} \
202+
intel/intel-extension-for-pytorch:2.3.0-pip-multinode \
203+
bash -c 'ipexrun cpu /workspace/tests/ipex-resnet50.py --ipex --device cpu --backend ccl'
204+
```
205+
206+
> [!NOTE]
207+
> [Intel MPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html) can be configured based on your machine settings. If the above commands do not work for you, see the documentation for how to configure based on your network.
208+
116209
---
117210

118211
The images below are [TorchServe*] with CPU Optimizations:

pytorch/generate_ssh_keys.sh

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#!/usr/bin/env bash
2+
# Copyright (c) 2023 Intel Corporation
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
# SPDX-License-Identifier: Apache-2.0
17+
18+
# Generate the SSH host key for a single algorithm, unless the key file already exists.
#
# Arguments:
#   $1 - key type handed to `ssh-keygen -t` (for example: rsa, ed25519)
#
# The key is written to /etc/ssh/ssh_host_<type>_key with an empty passphrase.
function gen_single_key() {
    # Locals keep these names out of the calling shell, which matters when the
    # contents of this script are sourced rather than executed.
    local alg_name=$1
    local key_path="/etc/ssh/ssh_host_${alg_name}_key"

    if [[ ! -f "${key_path}" ]]; then
        ssh-keygen -q -N "" -t "${alg_name}" -f "${key_path}"
    fi
}

# One host key per algorithm.
gen_single_key dsa
gen_single_key rsa
gen_single_key ecdsa
gen_single_key ed25519

0 commit comments

Comments
 (0)