Skip to content

Commit fc9afb4

Browse files
Adds a Hugging Face distributed LLM fine tuning CPU workflow with k8s (#98)
Signed-off-by: Dina Suehiro Jones <dina.s.jones@intel.com> Signed-off-by: dmsuehir <dina.s.jones@intel.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 9309a31 commit fc9afb4

24 files changed

+2869
-1
lines changed

.github/linters/.yaml-lint.yml

+3-1
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,6 @@
1313
# limitations under the License.
1414

1515
---
16-
ignore: '**/templates/**.yaml'
16+
ignore:
17+
- '**/templates/**.yaml'
18+
- '**/templates/tests/**.yaml'

pytorch/Dockerfile

+9
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,15 @@ RUN python -m pip install --no-cache-dir -r multinode-requirements.txt
9494
ARG PYTHON_VERSION
9595
RUN echo "source /usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/env/setvars.sh" >> ~/.bashrc
9696

97+
98+
# Expose the oneCCL bindings bundled with the Python package to every process,
# not only to shells that source setvars.sh from ~/.bashrc.
#
# I_MPI_ROOT and CCL_ROOT are used as a single root directory rather than a
# search list, so they are assigned outright instead of being colon-appended to
# any previous value.
ENV I_MPI_ROOT="/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch"
ENV CCL_ROOT="/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch"
# For the search-path variables, emit the ':' separator only when the variable
# already has a value. Otherwise an unset variable would produce a leading empty
# entry, which path-list consumers such as the dynamic linker and GCC interpret
# as the current working directory.
ENV FI_PROVIDER_PATH="${FI_PROVIDER_PATH:+${FI_PROVIDER_PATH}:}/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib/prov"
ENV LIBRARY_PATH="${LIBRARY_PATH:+${LIBRARY_PATH}:}/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/lib"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/lib"
ENV PATH="${PATH}:/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/bin"
ENV CPATH="${CPATH:+${CPATH}:}/usr/local/lib/python${PYTHON_VERSION}/dist-packages/oneccl_bindings_for_pytorch/include"
105+
97106
RUN mkdir -p /licensing
98107

99108
RUN wget -q --no-check-certificate https://raw.githubusercontent.com/oneapi-src/oneCCL/b7d66de16e17f88caffd7c6df4cd5e12b266af84/third-party-programs.txt -O /licensing/oneccl_third_party_programs.txt && \

workflows/.actions.json

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"experimental": [true],
3+
"runner_label": ["PVC"]
4+
}

workflows/README.md

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Workflows
2+
3+
This directory contains workflows demonstrating how the Intel Optimized base containers can be used for
4+
different use cases:
5+
6+
## PyTorch Workflows
7+
8+
| Base Container | Device Type | Example | Description |
9+
|----------------|-------------|---------|-------------|
10+
| `intel/intel-optimized-pytorch:2.3.0-pip-multinode` | CPU | [Distributed LLM Fine Tuning with Kubernetes](charts/huggingface-llm) | Demonstrates using Hugging Face Transformers with Intel® Xeon® Scalable Processors to fine-tune LLMs with multiple nodes from a Kubernetes cluster. The example includes an LLM fine-tuning script, Dockerfile, and Helm chart. |
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# Copyright (c) 2023 Intel Corporation
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
# SPDX-License-Identifier: Apache-2.0
16+
17+
18+
apiVersion: v2
19+
name: Hugging Face PyTorch Distributed Training
20+
description: This Helm chart deploys a distributed training job using the Kubeflow PyTorchJob training operator.
21+
22+
maintainers:
23+
- name: dmsuehir
24+
email: dina.s.jones@intel.com
25+
url: https://github.com/dmsuehir
26+
27+
# A chart can be either an 'application' or a 'library' chart.
28+
#
29+
# Application charts are a collection of templates that can be packaged into versioned archives
30+
# to be deployed.
31+
#
32+
# Library charts provide useful utilities or functions for the chart developer. They're included as
33+
# a dependency of application charts to inject those utilities and functions into the rendering
34+
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
35+
type: application
36+
37+
# This is the chart version. This version number should be incremented each time you make changes
38+
# to the chart and its templates, including the app version.
39+
# Versions are expected to follow Semantic Versioning (https://semver.org/)
40+
version: 0.2.0
41+
42+
# This is the version number of the application being deployed. This version number should be
43+
# incremented each time you make changes to the application. Versions are not expected to
44+
# follow Semantic Versioning. They should reflect the version the application is using.
45+
# It is recommended to use it with quotes.
46+
appVersion: "1.16.0"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# Copyright (c) 2023 Intel Corporation
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
# SPDX-License-Identifier: Apache-2.0
16+
17+
18+
ARG BASE_IMAGE_NAME=intel/intel-optimized-pytorch
19+
ARG BASE_IMAGE_TAG=2.3.0-pip-multinode
20+
21+
# Base image to be used everywhere
22+
FROM ${BASE_IMAGE_NAME}:${BASE_IMAGE_TAG} AS base
23+
RUN apt-get clean && \
24+
apt-get update && \
25+
apt-get upgrade -y && \
26+
apt-get clean autoclean && \
27+
apt-get autoremove -y && \
28+
rm -rf /var/lib/apt/lists/*
29+
30+
# Run subsequent RUN commands under bash with pipefail, so that a failure in any
# stage of a pipeline (not only its last command) fails the build.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
31+
32+
# Install google-perftools, jemalloc, the OpenMP development package and numactl.
# The apt package lists are removed in the same layer so they are not stored in
# the image.
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends --fix-missing \
        google-perftools \
        libjemalloc2 \
        libomp-dev \
        numactl && \
    rm -rf /var/lib/apt/lists/*
38+
39+
WORKDIR /workspace
40+
COPY requirements.txt .
41+
42+
RUN python -m pip install --no-cache-dir -r requirements.txt
43+
44+
# Install OpenSSH for MPI to communicate between containers.
# The SSH host keys are deleted so that none are baked into the image. `rm -f`
# keeps the build from failing when no key files match the globs (for example,
# if openssh-server is already present in the base image with its keys removed).
RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
        openssh-client \
        openssh-server && \
    rm -f /etc/ssh/ssh_host_*_key \
        /etc/ssh/ssh_host_*_key.pub && \
    apt-get clean autoclean && \
    apt-get autoremove -y && \
    rm -rf /var/lib/apt/lists/*
53+
54+
# Allow OpenSSH to talk to containers without asking for confirmation:
# drop any existing StrictHostKeyChecking lines from the client config, append
# an explicit "StrictHostKeyChecking no", then swap the new file into place.
# grep reads the file directly, so no `cat |` pipe (and no SC2002 suppression)
# is needed.
RUN mkdir -p /var/run/sshd && \
    grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
60+
61+
COPY scripts /workspace/scripts/
62+
63+
COPY generate_ssh_keys.sh /workspace/generate_ssh_keys.sh
64+
65+
# Append generate_ssh_keys.sh to both bash startup files so its contents run for
# shells that read either ~/.bash_profile or ~/.bashrc, and strip the
# `source /inc/bin/activate` text from ~/.bashrc before appending to it.
# NOTE(review): the script body is not visible here; presumably it creates the
# SSH keys at shell startup - confirm against generate_ssh_keys.sh.
RUN cat /workspace/generate_ssh_keys.sh >> "${HOME}/.bash_profile" \
    && sed -i 's#source /inc/bin/activate##g' "${HOME}/.bashrc" \
    && cat /workspace/generate_ssh_keys.sh >> "${HOME}/.bashrc"
68+
69+
# Point non-interactive bash shells at the profile written by the RUN step above.
# ENV substitution only sees values declared with ARG/ENV (including those from
# the base image), not the shell's HOME, so a bare ${HOME} can expand to nothing
# and yield "/.bash_profile". Fall back to /root when HOME is not defined at
# build time (assumes the image runs as root - confirm against the base image).
ENV BASH_ENV="${HOME:-/root}/.bash_profile"

0 commit comments

Comments
 (0)