Skip to content

Commit 86bf281

Browse files
authored
Merge branch 'main' into sramakr1/apptainer_python
2 parents ad5561e + 0a448bf commit 86bf281

19 files changed

+145
-76
lines changed

.github/workflows/container-ci.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ jobs:
119119
uses: step-security/harden-runner@17d0e2bd7d51742c71671bd19fa12bdc9d40a3d6 # v2.8.1
120120
with:
121121
egress-policy: audit
122-
- uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
122+
- uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
123123
with:
124124
path: matrix
125125
- name: Set Matrix

.github/workflows/lint.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ jobs:
3737
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
3838
with:
3939
fetch-depth: 0
40-
- uses: super-linter/super-linter/slim@88ea3923a7e1f89dd485d079f6eb5f5e8f937589 # v6.6.0
40+
- uses: super-linter/super-linter/slim@3fe03abab2eafb293ace16d4a3b07aeabcb3f1a0 # v6.7.0
4141
env:
4242
# To report GitHub Actions status checks
4343
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/scorecard.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ jobs:
4848
results_format: sarif
4949
repo_token: ${{ secrets.GITHUB_TOKEN }}
5050
publish_results: true
51-
- uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3
51+
- uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # v4.3.4
5252
with:
5353
name: SARIF file
5454
path: results.sarif

.github/workflows/security-report.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ jobs:
3535
sarifReportDir: ${{ github.workspace }}
3636
template: report
3737
token: ${{ secrets.GITHUB_TOKEN }}
38-
- uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3
38+
- uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # v4.3.4
3939
with:
4040
name: Security Report Summary
4141
path: ./*.pdf

.github/workflows/test-runner-ci.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ jobs:
3737
with:
3838
egress-policy: audit
3939
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
40-
- uses: docker/setup-buildx-action@d70bba72b1f3fd22344832f00baa16ece964efeb # v3.3.0
40+
- uses: docker/setup-buildx-action@4fd812986e6c8c2a69e18311145f9371337f27d4 # v3.4.0
4141
with:
4242
driver: docker
4343
- uses: docker/login-action@0d4c9c5ea7693da7b068278f7b52bda2a190a446 # v3.2.0
@@ -80,7 +80,7 @@ jobs:
8080
with:
8181
egress-policy: audit
8282
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
83-
- uses: docker/setup-buildx-action@d70bba72b1f3fd22344832f00baa16ece964efeb # v3.3.0
83+
- uses: docker/setup-buildx-action@4fd812986e6c8c2a69e18311145f9371337f27d4 # v3.4.0
8484
with:
8585
driver: docker
8686
- uses: docker/login-action@0d4c9c5ea7693da7b068278f7b52bda2a190a446 # v3.2.0

docs/requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
mkdocs-callouts>=1.13.2
22
mkdocs-git-authors-plugin>=0.8.0
33
mkdocs-git-revision-date-localized-plugin>=1.2.5
4-
mkdocs-material==9.5.27
4+
mkdocs-material==9.5.28
55
mkdocs-table-reader-plugin>=2.1.0
66
mkdocs==1.6.0
77
pandas>=2.0.3

preset/deep-learning/Dockerfile

+2-1
Original file line numberDiff line numberDiff line change
@@ -421,7 +421,8 @@ RUN apt-get install -y --no-install-recommends --fix-missing \
421421
/etc/ssh/ssh_host_*_key.pub && \
422422
rm -rf /var/lib/apt/lists/*
423423

424-
RUN mkdir -p /var/run/sshd
424+
RUN mkdir -p /var/run/sshd && \
425+
echo 'LoginGraceTime 0' >> /etc/ssh/sshd_config
425426

426427
# https://github.com/openucx/ucx/issues/4742#issuecomment-584059909
427428
ENV UCX_TLS=ud,sm,self

pytorch/Dockerfile

+8-27
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,10 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missin
8585
ENV SIGOPT_PROJECT=.
8686

8787
WORKDIR /
88-
COPY multinode-requirements.txt .
88+
COPY multinode/requirements.txt requirements.txt
8989

90-
RUN python -m pip install --no-cache-dir -r multinode-requirements.txt
90+
RUN python -m pip install --no-cache-dir -r requirements.txt && \
91+
rm -rf requirements.txt
9192

9293
ENV LD_LIBRARY_PATH="/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/lib"
9394

@@ -99,16 +100,11 @@ RUN apt-get install -y --no-install-recommends --fix-missing \
99100
apt-get clean && \
100101
rm -rf /var/lib/apt/lists/*
101102

102-
# Allow OpenSSH to talk to containers without asking for confirmation
103-
# hadolint global ignore=SC2002
104-
RUN mkdir -p /var/run/sshd && \
105-
cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
106-
echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
107-
mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
103+
RUN mkdir -p /var/run/sshd
108104

109105
ARG PYTHON_VERSION
110106

111-
COPY generate_ssh_keys.sh .
107+
COPY multinode/generate_ssh_keys.sh /generate_ssh_keys.sh
112108

113109
# modify generate_ssh_keys to be a helper script
114110
# print how to use helper script on bash startup
@@ -117,24 +113,9 @@ RUN echo "source /usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bin
117113
cat '/generate_ssh_keys.sh' >> ~/.startup && \
118114
rm -rf /generate_ssh_keys.sh
119115

120-
# hadolint global ignore=SC3037
121-
RUN echo -e "#!/bin/bash \n\
122-
set -e \n\
123-
set -a \n\
124-
source ~/.startup \n\
125-
set +a \n\
126-
eval \"\$@\"" >> /usr/local/bin/dockerd-entrypoint.sh && \
127-
chmod +x /usr/local/bin/dockerd-entrypoint.sh
128-
129-
RUN echo 'HostKey /etc/ssh/ssh_host_dsa_key' > /var/run/sshd_config && \
130-
echo 'HostKey /etc/ssh/ssh_host_rsa_key' > /var/run/sshd_config && \
131-
echo 'HostKey /etc/ssh/ssh_host_ecdsa_key' > /var/run/sshd_config && \
132-
echo 'HostKey /etc/ssh/ssh_host_ed25519_key' > /var/run/sshd_config && \
133-
echo 'AuthorizedKeysFile /etc/ssh/authorized_keys' > /var/run/sshd_config && \
134-
echo '## Enable DEBUG log. You can ignore this but this may help you debug any issue while enabling SSHD for the first time' > /var/run/sshd_config && \
135-
echo 'LogLevel DEBUG3' > /var/run/sshd_config && \
136-
echo 'UsePAM yes' > /var/run/sshd_config && \
137-
echo 'Subsystem sftp /usr/lib/openssh/sftp-server' > /var/run/sshd_config
116+
COPY multinode/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh
117+
COPY multinode/sshd_config /etc/ssh/sshd_config
118+
COPY multinode/ssh_config /etc/ssh/ssh_config
138119

139120
RUN mkdir -p /licensing
140121

pytorch/README.md

+85-37
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,8 @@ The images below additionally include [Jupyter Notebook](https://jupyter.org/) s
4545

4646
| Tag(s) | Pytorch | IPEX | Driver | Jupyter Port | Dockerfile |
4747
| --------------------- | -------- | ------------- | ------ | ------------ | --------------- |
48-
| `xpu-jupyter` | [v2.1.0] | [v2.1.30+xpu] | [803] | `8888` | [v0.4.0-Beta] |
49-
| `2.1.20-xpu-pip-base` | [v2.1.0] | [v2.1.20+xpu] | [803] | `8888` | [v0.3.4] |
50-
| `2.1.10-xpu-pip-base` | [v2.1.0] | [v2.1.10+xpu] | [736] | `8888` | [v0.3.4] |
48+
| `2.1.20-xpu-pip-jupyter` | [v2.1.0] | [v2.1.20+xpu] | [803] | `8888` | [v0.3.4] |
49+
| `2.1.10-xpu-pip-jupyter` | [v2.1.0] | [v2.1.10+xpu] | [736] | `8888` | [v0.2.3] |
5150

5251
### Run the XPU Jupyter Container
5352

@@ -58,7 +57,7 @@ docker run -it --rm \
5857
--device /dev/dri \
5958
-v /dev/dri/by-path:/dev/dri/by-path \
6059
--ipc=host \
61-
intel/intel-extension-for-pytorch:xpu-jupyter
60+
intel/intel-extension-for-pytorch:2.1.20-xpu-pip-jupyter
6261
```
6362

6463
After running the command above, copy the URL (something like `http://127.0.0.1:$PORT/?token=***`) into your browser to access the notebook server.
@@ -114,12 +113,8 @@ The images below additionally include [Intel® oneAPI Collective Communications
114113
| `2.0.0-pip-multinode` | [v2.0.0] | [v2.0.0+cpu] | [v2.0.0][ccl-v2.0.0] | [v2.1.1] | [v0.1.0] |
115114

116115
> **Note:** Passwordless SSH connection is also enabled in the image.
117-
> The container does not contain the SSH ID keys. The user needs to mount those keys at `/root/.ssh/id_rsa` and `/root/.ssh/id_rsa.pub`.
118-
> User also need to append content of id_rsa.pub in `/etc/ssh/authorized_keys` in the SSH server container.
119-
> Since the SSH key is not owned by default user account in docker, please also do "chmod 644 id_rsa.pub; chmod 644 id_rsa" to grant read access for default user account.
120-
> Users could also use "/usr/bin/ssh-keygen -t rsa -b 4096 -N '' -f ~/mnt/ssh_key/id_rsa" to generate a new SSH Key inside the container.
121-
> Users need to mount a config file to list all hostnames at location `/root/.ssh/config` on the SSH client container.
122-
> Once all files are added
116+
> The container does not contain the SSH ID keys. The user needs to mount those keys at `/root/.ssh/id_rsa` and `/etc/ssh/authorized_keys`.
117+
> Since the SSH key is not owned by the default user account in Docker, please also run "chmod 600 authorized_keys; chmod 600 id_rsa" to grant read access to the default user account.
123118
124119
#### Setup and Run IPEX Multi-Node Container
125120

@@ -131,8 +126,7 @@ SSH Server (Worker)
131126

132127
SSH Client (Launcher)
133128

134-
1. *Config File with Host IPs* : `/root/.ssh/config`
135-
2. *Private User Key* : `/root/.ssh/id_rsa`
129+
1. *Private User Key* : `/root/.ssh/id_rsa`
136130

137131
To add these files correctly please follow the steps described below.
138132

@@ -146,47 +140,33 @@ To add these files correctly please follow the steps described below.
146140
cat id_rsa.pub >> authorized_keys
147141
```
148142

149-
2. Add hosts to config
150-
151-
The launcher container needs to have the a config file with all hostnames and ports specified. An example of a hostfile is provided below.
143+
2. Configure the permissions and ownership for all of the files you have created so far.
152144

153145
```bash
154-
touch config
146+
chmod 600 id_rsa config authorized_keys
147+
chown root:root id_rsa.pub id_rsa config authorized_keys
155148
```
156149

150+
3. Set up the hostfile. The hostfile is needed for running torch distributed using the `ipexrun` utility. If you're not using `ipexrun`, you can skip this step.
151+
157152
```txt
158-
Host host1
159-
HostName <Hostname of host1>
160-
IdentitiesOnly yes
161-
Port <SSH Port>
162-
Host host2
163-
HostName <Hostname of host2>
164-
IdentitiesOnly yes
165-
Port <SSH Port>
153+
<Host 1 IP/Hostname>
154+
<Host 2 IP/Hostname>
166155
...
167156
```
168157
169-
3. Configure the permissions and ownership for all of the files you have created so far.
170-
171-
```bash
172-
chmod 600 id_rsa.pub id_rsa config authorized_keys
173-
chown root:root id_rsa.pub id_rsa config authorized_keys
174-
```
175-
176158
4. Now start the workers and execute DDP on the launcher.
177159
178160
1. Worker run command:
179161
180162
```bash
181-
export SSH_PORT=<SSH Port>
182163
docker run -it --rm \
183164
--net=host \
184-
-v $PWD/authorized_keys:/root/.ssh/authorized_keys \
165+
-v $PWD/authorized_keys:/etc/ssh/authorized_keys \
185166
-v $PWD/tests:/workspace/tests \
186167
-w /workspace \
187-
-e SSH_PORT=${SSH_PORT} \
188168
intel/intel-extension-for-pytorch:2.3.0-pip-multinode \
189-
bash -c '/usr/sbin/sshd -D -p ${SSH_PORT} -f /var/run/sshd_config'
169+
bash -c '/usr/sbin/sshd -D'
190170
```
191171
192172
2. Launcher run command:
@@ -195,12 +175,65 @@ To add these files correctly please follow the steps described below.
195175
docker run -it --rm \
196176
--net=host \
197177
-v $PWD/id_rsa:/root/.ssh/id_rsa \
198-
-v $PWD/config:/root/.ssh/config \
199178
-v $PWD/tests:/workspace/tests \
179+
-v $PWD/hostfile:/workspace/hostfile \
200180
-w /workspace \
181+
intel/intel-extension-for-pytorch:2.3.0-pip-multinode \
182+
bash -c 'ipexrun cpu --nnodes 2 --nprocs-per-node 1 --master-addr 127.0.0.1 --master-port 3022 /workspace/tests/ipex-resnet50.py --ipex --device cpu --backend ccl'
183+
```
184+
185+
5. Start SSH server with a custom port.
186+
If the user wants to start the SSH server on a port of their own choosing, this can be done using the commands described below.
187+
188+
1. Worker command:
189+
190+
```bash
191+
export SSH_PORT=<User SSH Port>
192+
docker run -it --rm \
193+
--net=host \
194+
-v $PWD/authorized_keys:/etc/ssh/authorized_keys \
195+
-v $PWD/tests:/workspace/tests \
201196
-e SSH_PORT=${SSH_PORT} \
197+
-w /workspace \
202198
intel/intel-extension-for-pytorch:2.3.0-pip-multinode \
203-
bash -c 'ipexrun cpu /workspace/tests/ipex-resnet50.py --ipex --device cpu --backend ccl'
199+
bash -c '/usr/sbin/sshd -D -p ${SSH_PORT}'
200+
```
201+
202+
2. Add hosts to config. (**Note:** This is an optional step)
203+
204+
Users can optionally mount their own custom client config file to define a list of hosts and ports where the SSH server is running inside the container. An example of a config file is provided below. This file should be mounted in the launcher container at `/etc/ssh/ssh_config`.
205+
206+
```bash
207+
touch config
208+
```
209+
210+
```txt
211+
Host host1
212+
HostName <Hostname of host1>
213+
IdentitiesOnly yes
214+
IdentityFile ~/.ssh/id_rsa
215+
Port <SSH Port>
216+
Host host2
217+
HostName <Hostname of host2>
218+
IdentitiesOnly yes
219+
IdentityFile ~/.ssh/id_rsa
220+
Port <SSH Port>
221+
...
222+
```
223+
224+
3. Launcher run command:
225+
226+
```bash
227+
docker run -it --rm \
228+
--net=host \
229+
-v $PWD/id_rsa:/root/.ssh/id_rsa \
230+
-v $PWD/config:/etc/ssh/ssh_config \
231+
-v $PWD/hostfile:/workspace/hostfile \
232+
-v $PWD/tests:/workspace/tests \
233+
-e SSH_PORT=${SSH_PORT} \
234+
-w /workspace \
235+
intel/intel-extension-for-pytorch:2.3.0-pip-multinode \
236+
bash -c 'ipexrun cpu --nnodes 2 --nprocs-per-node 1 --master-addr 127.0.0.1 --master-port ${SSH_PORT} /workspace/tests/ipex-resnet50.py --ipex --device cpu --backend ccl'
204237
```
205238
206239
> [!NOTE]
@@ -246,6 +279,21 @@ The images below additionally include [Intel® oneAPI Collective Communications
246279
| `2.1.0-idp-mulitnode` | [v2.1.0] | [v2.1.0+cpu] | [v2.1.0][ccl-v2.1.0] | [v2.3.1] | [v0.2.3] |
247280
| `2.0.0-idp-multinode` | [v2.0.0] | [v2.0.0+cpu] | [v2.0.0][ccl-v2.0.0] | [v2.1.1] | [v0.1.0] |
248281
282+
## XPU images with Intel® Distribution for Python*
283+
284+
The images below are built only with CPU and GPU optimizations and include [Intel® Distribution for Python*]:
285+
286+
| Tag(s) | Pytorch | IPEX | Driver | Dockerfile |
287+
| ---------------- | -------- | ------------ | -------- | ------ |
288+
| `2.1.10-xpu-idp-base` | [v2.1.0] | [v2.1.10+xpu] | [736] | [v0.2.3] |
289+
290+
The images below additionally include [Jupyter Notebook](https://jupyter.org/) server:
291+
292+
| Tag(s) | Pytorch | IPEX | Driver | Jupyter Port | Dockerfile |
293+
| --------------------- | -------- | ------------- | ------ | ------------ | --------------- |
294+
| `2.1.20-xpu-idp-jupyter` | [v2.1.0] | [v2.1.20+xpu] | [803] | `8888` | [v0.3.4] |
295+
| `2.1.10-xpu-idp-jupyter` | [v2.1.0] | [v2.1.10+xpu] | [736] | `8888` | [v0.2.3] |
296+
249297
## Build from Source
250298
251299
To build the images from source, clone the [Intel® AI Containers](https://github.com/intel/ai-containers) repository, follow the main `README.md` file to setup your environment, and run the following command:

pytorch/docker-compose.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ services:
7777
dependency.apt.libglib2: true
7878
dependency.apt.python3-dev: true
7979
dependency.pip.apt.virtualenv: true
80-
dependency.python.pip: multinode-requirements.txt
80+
dependency.python.pip: multinode/requirements.txt
8181
org.opencontainers.base.name: "intel/intel-optimized-pytorch:${IPEX_VERSION:-2.2.0}-${PACKAGE_OPTION:-pip}-base"
8282
org.opencontainers.image.title: "Intel® Extension for PyTorch MultiNode Image"
8383
org.opencontainers.image.version: ${IPEX_VERSION:-2.2.0}-${PACKAGE_OPTION:-pip}-multinode
+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/bash
2+
# Copyright (c) 2024 Intel Corporation
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
set -e
17+
set -a
18+
# shellcheck disable=SC1091
19+
source "$HOME/.startup"
20+
set +a
21+
"$@"
File renamed without changes.
File renamed without changes.

pytorch/multinode/ssh_config

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Host *
2+
Port 3022
3+
IdentityFile ~/.ssh/id_rsa
4+
StrictHostKeyChecking no

pytorch/multinode/sshd_config

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
HostKey /etc/ssh/ssh_host_dsa_key
2+
HostKey /etc/ssh/ssh_host_rsa_key
3+
HostKey /etc/ssh/ssh_host_ecdsa_key
4+
HostKey /etc/ssh/ssh_host_ed25519_key
5+
AuthorizedKeysFile /etc/ssh/authorized_keys
6+
## Enable DEBUG log. You can ignore this but this may help you debug any issue while enabling SSHD for the first time
7+
LogLevel DEBUG3
8+
Port 3022
9+
UsePAM yes
10+
Subsystem sftp /usr/lib/openssh/sftp-server
11+
# https://ubuntu.com/security/CVE-2024-6387
12+
LoginGraceTime 0

tensorflow/Dockerfile

+2-1
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,8 @@ ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
106106
ENV OMPI_MCA_tl_tcp_if_exclude="lo,docker0"
107107

108108
# Install OpenSSH for MPI to communicate between containers
109-
RUN mkdir -p /var/run/sshd
109+
RUN mkdir -p /var/run/sshd && \
110+
echo 'LoginGraceTime 0' >> /etc/ssh/sshd_config
110111

111112
# Install Horovod
112113
ARG HOROVOD_WITH_TENSORFLOW=1

test-runner/dev-requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ coverage>=7.5.0
33
coveralls>=4.0.1
44
expandvars>=0.12.0
55
hypothesis>=6.100.1
6-
pydantic==2.7.4
6+
pydantic==2.8.2
77
pylint>=3.1.0
88
pytest>=8.1.1
99
python_on_whales>=0.70.1

0 commit comments

Comments
 (0)