Skip to content

Commit 6e77197

Browse files
committed
Gaudi 1.18 notebook release
Signed-off-by: sharvil10 <sharvil.shah@intel.com>
1 parent 80e0b2b commit 6e77197

File tree

4 files changed

+91
-48
lines changed

4 files changed

+91
-48
lines changed

enterprise/redhat/openshift-ai/gaudi/docker/Dockerfile.rhel9.2

+15-5
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ RUN dnf install -y \
5151
wget \
5252
git \
5353
libffi-devel \
54+
bzip2 \
5455
bzip2-devel \
5556
zlib-devel \
5657
mesa-libGL \
@@ -59,7 +60,8 @@ RUN dnf install -y \
5960
# update pkgs (except OS version) for resolving potential CVEs
6061
dnf versionlock add redhat-release* && \
6162
dnf update -y && \
62-
dnf clean all && rm -rf /var/cache/yum
63+
dnf clean all && rm -rf /var/cache/yum && \
64+
rm -f /etc/ssh/ssh_host_*_key*
6365

6466
RUN mkdir -p /licenses && \
6567
wget -O /licenses/LICENSE https://raw.githubusercontent.com/intel/ai-containers/main/LICENSE
@@ -72,20 +74,22 @@ ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
7274
COPY install_efa.sh .
7375
RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh
7476

77+
ENV OPENMPI_VERSION=4.1.6
7578
ENV LIBFABRIC_VERSION="1.20.0"
7679
ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}"
77-
ENV MPI_ROOT=/opt/amazon/openmpi
80+
ENV MPI_ROOT=/opt/habanalabs/openmpi
7881
ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH
7982
ENV PATH=${LIBFABRIC_ROOT}/bin:${MPI_ROOT}/bin:$PATH
8083
ENV OPAL_PREFIX=${MPI_ROOT}
8184
ENV MPICC=${MPI_ROOT}/bin/mpicc
8285
ENV RDMAV_FORK_SAFE=1
83-
ENV FI_EFA_USE_DEVICE_RDMA=1
86+
ENV FI_EFA_USE_DEVICE_RDMA=0
87+
ENV OMPI_MCA_btl=^openib
8488

8589
RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \
8690
echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \
8791
echo "baseurl=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.2" >> /etc/yum.repos.d/habanalabs.repo && \
88-
echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/api/v2/repositories/rhel/keyPairs/primary/public" >> /etc/yum.repos.d/habanalabs.repo && \
92+
echo "gpgkey=https://${ARTIFACTORY_URL}/artifactory/rhel/9/9.2/repodata/repomd.xml.key" >> /etc/yum.repos.d/habanalabs.repo && \
8993
echo "gpgcheck=1" >> /etc/yum.repos.d/habanalabs.repo
9094

9195
# for Habana GPG key with SHA-1 signature
@@ -112,6 +116,12 @@ RUN wget -nv -O /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 https://github.com/o
112116
./configure --prefix=$LIBFABRIC_ROOT --enable-psm3-verbs --enable-verbs=yes --with-synapseai=/usr && \
113117
make && make install && cd / && rm -rf /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 /tmp/libfabric-${LIBFABRIC_VERSION}
114118

119+
RUN wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VERSION}.tar.gz && \
120+
tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \
121+
cd /tmp/openmpi-${OPENMPI_VERSION} && \
122+
./configure --prefix=${MPI_ROOT} --with-libfabric=$LIBFABRIC_ROOT --with-verbs && \
123+
make -j$(nproc) && make install && cd / && rm -rf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz /tmp/openmpi-${OPENMPI_VERSION}
124+
115125
RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/heads/main.zip && \
116126
unzip /tmp/main.zip -d /tmp && \
117127
cd /tmp/hccl_ofi_wrapper-main && \
@@ -126,7 +136,7 @@ ENV HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins
126136

127137
ENV APP_ROOT="/opt/app-root"
128138

129-
RUN python3.10 -m pip install "pip>=23.3" "setuptools>=70.0.0" "wheel==0.38.4"
139+
RUN python3.10 -m pip install "pip==24.2" "setuptools==75.1.0" "wheel==0.44.0"
130140

131141
WORKDIR ${APP_ROOT}
132142

enterprise/redhat/openshift-ai/gaudi/docker/Dockerfile.rhel9.4

+32-13
Original file line numberDiff line numberDiff line change
@@ -49,28 +49,30 @@ RUN dnf install -y \
4949
lsof \
5050
python3-devel \
5151
openssh-clients \
52-
openssl-1:3.0.7-27.el9 \
53-
openssl-devel-1:3.0.7-27.el9 \
52+
openssl-1:3.0.7-28.el9_4 \
53+
openssl-devel-1:3.0.7-28.el9_4 \
5454
libjpeg-devel \
5555
openssh-server \
5656
lsb_release \
5757
wget \
5858
git \
5959
libffi-devel \
60+
bzip2 \
6061
bzip2-devel \
6162
zlib-devel \
6263
mesa-libGL \
6364
iproute \
6465
python3.11 \
6566
python3.11-pip \
6667
python3.11-devel \
68+
python3.11-rpm \
6769
ffmpeg-free \
68-
perl-Net-SSLeay-1.92-2.el9 \
6970
python3-dnf-plugin-versionlock && \
7071
# update pkgs (except OS version) for resolving potential CVEs
71-
dnf versionlock add redhat-release* openssl* perl-Net-SSLeay && \
72+
dnf versionlock add redhat-release* openssl* libcurl-minimal curl-minimal ima-evm-utils python3-rpm rpm* && \
7273
dnf update -y && \
73-
dnf clean all && rm -rf /var/cache/yum
74+
dnf clean all && rm -rf /var/cache/yum && \
75+
rm -f /etc/ssh/ssh_host_*_key*
7476

7577
RUN mkdir -p /licenses && \
7678
wget -O /licenses/LICENSE https://raw.githubusercontent.com/intel/ai-containers/main/LICENSE
@@ -85,15 +87,17 @@ RUN alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 2 && \
8587
COPY install_efa.sh .
8688
RUN ./install_efa.sh && rm install_efa.sh && rm -rf /etc/ld.so.conf.d/efa.conf /etc/profile.d/efa.sh
8789

88-
ENV LIBFABRIC_VERSION="1.20.0"
90+
ENV OPENMPI_VERSION=4.1.6
91+
ENV LIBFABRIC_VERSION="1.22.0"
8992
ENV LIBFABRIC_ROOT="/opt/habanalabs/libfabric-${LIBFABRIC_VERSION}"
90-
ENV MPI_ROOT=/opt/amazon/openmpi
93+
ENV MPI_ROOT=/opt/habanalabs/openmpi
9194
ENV LD_LIBRARY_PATH=$LIBFABRIC_ROOT/lib:${MPI_ROOT}/lib:/usr/lib/habanalabs:$LD_LIBRARY_PATH
9295
ENV PATH=${LIBFABRIC_ROOT}/bin:${MPI_ROOT}/bin:$PATH
9396
ENV OPAL_PREFIX=${MPI_ROOT}
9497
ENV MPICC=${MPI_ROOT}/bin/mpicc
9598
ENV RDMAV_FORK_SAFE=1
96-
ENV FI_EFA_USE_DEVICE_RDMA=1
99+
ENV FI_EFA_USE_DEVICE_RDMA=0
100+
ENV OMPI_MCA_btl=^openib
97101

98102
RUN echo "[habanalabs]" > /etc/yum.repos.d/habanalabs.repo && \
99103
echo "name=Habana RH9 Linux repo" >> /etc/yum.repos.d/habanalabs.repo && \
@@ -125,6 +129,12 @@ RUN wget -nv -O /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 https://github.com/o
125129
./configure --prefix=$LIBFABRIC_ROOT --enable-psm3-verbs --enable-verbs=yes --with-synapseai=/usr && \
126130
make && make install && cd / && rm -rf /tmp/libfabric-${LIBFABRIC_VERSION}.tar.bz2 /tmp/libfabric-${LIBFABRIC_VERSION}
127131

132+
RUN wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OPENMPI_VERSION}.tar.gz && \
133+
tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \
134+
cd /tmp/openmpi-${OPENMPI_VERSION} && \
135+
./configure --prefix=${MPI_ROOT} --with-libfabric=$LIBFABRIC_ROOT --with-verbs && \
136+
make -j$(nproc) && make install && cd / && rm -rf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz /tmp/openmpi-${OPENMPI_VERSION}
137+
128138
RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archive/refs/heads/main.zip && \
129139
unzip /tmp/main.zip -d /tmp && \
130140
cd /tmp/hccl_ofi_wrapper-main && \
@@ -134,7 +144,7 @@ RUN wget -nv -O /tmp/main.zip https://github.com/HabanaAI/hccl_ofi_wrapper/archi
134144

135145
ENV APP_ROOT="/opt/app-root"
136146

137-
RUN python3.11 -m pip install pip==23.3.1 setuptools==67.3.3 wheel==0.38.4
147+
RUN python3.11 -m pip install pip==24.2 setuptools==75.1.0 wheel==0.44.0
138148

139149
WORKDIR ${APP_ROOT}
140150

@@ -170,7 +180,7 @@ ARG ARTIFACTORY_URL
170180
ENV BASE_NAME=rhel9.4
171181

172182
LABEL name="PyTorch Installer"
173-
LABEL summary="Habanalabs PyTorch installer layer for RHEL9.2"
183+
LABEL summary="Habanalabs PyTorch installer layer for RHEL9.4"
174184
LABEL description="Image with pre installed Habanalabs packages for PyTorch"
175185

176186
RUN echo "/usr/lib/habanalabs" > $(python -c "import sysconfig; print(sysconfig.get_path('platlib'))")/habanalabs-graph.pt
@@ -184,7 +194,7 @@ RUN echo "[CRB]" > /etc/yum.repos.d/CentOS-Linux-CRB.repo && \
184194
echo "gpgcheck=1" >> /etc/yum.repos.d/CentOS-Linux-CRB.repo
185195

186196
RUN dnf install --allowerasing -y \
187-
curl \
197+
curl-7.76.1-29.el9_4.1 \
188198
cairo-devel \
189199
numactl-devel \
190200
iproute \
@@ -196,10 +206,19 @@ RUN dnf install --allowerasing -y \
196206
gperftools-devel && \
197207
dnf clean all && rm -rf /var/cache/yum
198208

199-
RUN dnf config-manager --add-repo https://yum.repos.intel.com/mkl/setup/intel-mkl.repo -y && \
200-
dnf install --allowerasing -y intel-mkl-64bit-2020.4-912 && \
209+
RUN echo "[oneAPI]" >> /etc/yum.repos.d/oneAPI.repo && \
210+
echo "name=Intel® oneAPI repository" >> /etc/yum.repos.d/oneAPI.repo && \
211+
echo "baseurl=https://yum.repos.intel.com/oneapi" >> /etc/yum.repos.d/oneAPI.repo && \
212+
echo 'enabled=1' >> /etc/yum.repos.d/oneAPI.repo && \
213+
echo "gpgcheck=1" >> /etc/yum.repos.d/oneAPI.repo && \
214+
echo "repo_gpgcheck=1" >> /etc/yum.repos.d/oneAPI.repo && \
215+
echo "gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB" >> /etc/yum.repos.d/oneAPI.repo
216+
217+
RUN dnf install --allowerasing -y intel-oneapi-mkl-2024.2.0 && \
201218
dnf clean all && rm -rf /var/cache/yum
202219

220+
ENV LD_LIBRARY_PATH=/opt/intel/oneapi/mkl/2024.2/lib:${LD_LIBRARY_PATH}
221+
203222
RUN rm -rf /tmp/*
204223

205224
USER 1001

enterprise/redhat/openshift-ai/gaudi/docker/docker-compose.yaml

+11-11
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,12 @@ services:
2222
https_proxy: ${https_proxy}
2323
no_proxy: ""
2424
ARTIFACTORY_URL: ${ARTIFACTORY_URL:-vault.habana.ai}
25-
VERSION: ${VERSION:-1.17.1}
26-
REVISION: ${REVISION:-40}
25+
VERSION: ${VERSION:-1.18.0}
26+
REVISION: ${REVISION:-524}
2727
context: .
2828
target: gaudi-base
2929
dockerfile: Dockerfile.rhel${RHEL_OS:-9.2}
30-
image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-base-${VERSION:-1.17.1}-${REVISION:-40}-rhel-${RHEL_OS:-9.2}
30+
image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-base-${VERSION:-1.18.0}-${REVISION:-524}-rhel-${RHEL_OS:-9.2}
3131
entrypoint: ["/bin/bash", "-c"]
3232
command: >
3333
"hl-smi"
@@ -37,17 +37,17 @@ services:
3737
BASE_IMAGE: ${BASE_IMAGE:-registry.access.redhat.com/ubi9/ubi}
3838
BASE_TAG: ${RHEL_OS:-9.2}
3939
BASE_NAME: rhel${RHEL_OS:-rhel9.2}
40-
PT_VERSION: ${PT_VERSION:-2.3.1}
40+
PT_VERSION: ${PT_VERSION:-2.4.0}
4141
http_proxy: ${http_proxy}
4242
https_proxy: ${https_proxy}
4343
no_proxy: ""
4444
ARTIFACTORY_URL: ${ARTIFACTORY_URL:-vault.habana.ai}
45-
VERSION: ${VERSION:-1.17.1}
46-
REVISION: ${REVISION:-40}
45+
VERSION: ${VERSION:-1.18.0}
46+
REVISION: ${REVISION:-524}
4747
context: .
4848
target: gaudi-pytorch
4949
dockerfile: Dockerfile.rhel${RHEL_OS:-9.2}
50-
image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-pytorch-${VERSION:-1.17.1}-${REVISION:-40}-rhel-${RHEL_OS:-9.2}
50+
image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-pytorch-${VERSION:-1.18.0}-${REVISION:-524}-rhel-${RHEL_OS:-9.2}
5151
entrypoint: ["/bin/bash", "-c"]
5252
command: >
5353
"python -c 'import torch'"
@@ -57,17 +57,17 @@ services:
5757
BASE_IMAGE: ${BASE_IMAGE:-registry.access.redhat.com/ubi9/ubi}
5858
BASE_TAG: ${RHEL_OS:-9.2}
5959
BASE_NAME: ${BASE_NAME:-rhel9.2}
60-
PT_VERSION: ${PT_VERSION:-2.3.1}
60+
PT_VERSION: ${PT_VERSION:-2.4.0}
6161
http_proxy: ${http_proxy}
6262
https_proxy: ${https_proxy}
6363
no_proxy: ""
6464
ARTIFACTORY_URL: ${ARTIFACTORY_URL:-vault.habana.ai}
65-
VERSION: ${VERSION:-1.17.1}
66-
REVISION: ${REVISION:-40}
65+
VERSION: ${VERSION:-1.18.0}
66+
REVISION: ${REVISION:-524}
6767
context: .
6868
target: gaudi-notebooks
6969
dockerfile: Dockerfile.rhel${RHEL_OS:-9.2}
70-
image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-notebook-${VERSION:-1.17.1}-${REVISION:-40}-rhel-${RHEL_OS:-9.2}
70+
image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-notebook-${VERSION:-1.18.0}-${REVISION:-524}-rhel-${RHEL_OS:-9.2}
7171
entrypoint: ["/bin/bash", "-c"]
7272
command: >
7373
"python -m jupyter notebook --version"

enterprise/redhat/openshift-ai/gaudi/docker/install_efa.sh

+33-19
Original file line numberDiff line numberDiff line change
@@ -14,27 +14,41 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17-
DEFAULT_EFA_INSTALLER_VER=1.29.0
17+
#!/bin/bash -ex
18+
19+
DEFAULT_EFA_INSTALLER_VER=1.34.0
1820
efa_installer_version=${1:-$DEFAULT_EFA_INSTALLER_VER}
1921

2022
tmp_dir=$(mktemp -d)
21-
wget -nv https://efa-installer.amazonaws.com/aws-efa-installer-"$efa_installer_version".tar.gz -P "$tmp_dir"
22-
tar -xf "$tmp_dir"/aws-efa-installer-"$efa_installer_version".tar.gz -C "$tmp_dir"
23-
pushd "$tmp_dir"/aws-efa-installer
24-
# shellcheck disable=SC1091
25-
case $(
26-
. /etc/os-release
27-
echo -n "$ID"
28-
) in
29-
rhel)
30-
# we cannot install dkms packages on RHEL images due to OCP rules
31-
rm -f RPMS/RHEL8/x86_64/dkms*.rpm
32-
;;
33-
tencentos)
34-
dnf install -y RPMS/ROCKYLINUX8/x86_64/rdma-core/libibverbs-46.0-1.el8.x86_64.rpm RPMS/ROCKYLINUX8/x86_64/rdma-core/libibverbs-utils-46.0-1.el8.x86_64.rpm
35-
patch -f -p1 -i /tmp/tencentos_efa_patch.txt --reject-file=tencentos_efa_patch.rej --no-backup-if-mismatch
36-
;;
23+
wget -nv https://efa-installer.amazonaws.com/aws-efa-installer-$efa_installer_version.tar.gz -P $tmp_dir
24+
tar -xf $tmp_dir/aws-efa-installer-$efa_installer_version.tar.gz -C $tmp_dir
25+
RUN_EFA_INSTALLER="./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify"
26+
pushd $tmp_dir/aws-efa-installer
27+
. /etc/os-release
28+
case $ID in
29+
rhel)
30+
# we cannot install dkms packages on RHEL images due to OCP rules
31+
find RPMS/ -name 'dkms*.rpm' -exec rm -f {} \;
32+
find RPMS/ -name 'efa-*.rpm' -exec rm -f {} \;
33+
case $VERSION_ID in
34+
8*)
35+
dnf install -y RPMS/ROCKYLINUX8/x86_64/rdma-core/*.rpm
36+
;;
37+
9*)
38+
dnf install -y RPMS/ROCKYLINUX9/x86_64/rdma-core/*.rpm
39+
;;
40+
*)
41+
echo "Unsupported RHEL version: $VERSION_ID"
42+
exit 1
43+
;;
44+
esac
45+
RUN_EFA_INSTALLER="echo 'Skipping EFA installer on RHEL'"
46+
;;
47+
tencentos)
48+
dnf install -y RPMS/ROCKYLINUX8/x86_64/rdma-core/*.rpm
49+
patch -f -p1 -i /tmp/tencentos_efa_patch.txt --reject-file=tencentos_efa_patch.rej --no-backup-if-mismatch
50+
;;
3751
esac
38-
./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify
52+
eval $RUN_EFA_INSTALLER
3953
popd
40-
rm -rf "$tmp_dir"
54+
rm -rf $tmp_dir

0 commit comments

Comments
 (0)