Skip to content

Commit

Permalink
[CI] fix docker image issues and build wheel for different python, py…
Browse files Browse the repository at this point in the history
…torch versions (#184)
  • Loading branch information
guocuimi authored May 14, 2024
1 parent 6f1f1b6 commit 05dd8f2
Show file tree
Hide file tree
Showing 9 changed files with 68 additions and 51 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: build and test
name: Build and test

on:
push:
Expand Down Expand Up @@ -46,11 +46,11 @@ jobs:
env:
BUILD_TYPE: ${{ matrix.build_type }}
# Tells vcpkg where binary packages are stored.
VCPKG_DEFAULT_BINARY_CACHE: ${{ github.workspace }}/../../_vcpkg/bincache
VCPKG_DEFAULT_BINARY_CACHE: ${{ github.workspace }}/../../ci_cache/.vcpkg/bincache
# Tells ccache where to store its cache.
CCACHE_DIR: ${{ github.workspace }}/../../_ccache
CCACHE_DIR: ${{ github.workspace }}/../../ci_cache/.ccache
# Tells where to store the dependencies.
DEPENDENCES_ROOT: ${{ github.workspace }}/../../_deps
DEPENDENCES_ROOT: ${{ github.workspace }}/../../ci_cache/.deps

steps:
- name: Install toolkits
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/format.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: format
name: Format
on:
pull_request:
branches:
Expand Down
8 changes: 2 additions & 6 deletions .github/workflows/publish_base_image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,9 @@ jobs:
build-args: |
UBUNTU_VERSION=22.04
CUDA_VERSION=12.1
GCC_VERSION=12
CMAKE_VERSION=3.18.5
NINJA_VERSION=1.9.0
tags: |
vectorchai/scalellm_builder:cuda12.1-ubuntu22.04
vectorchai/scalellm_builder:cuda12.1
- name: Build base for cuda 11.8
uses: docker/build-push-action@v5
Expand All @@ -45,9 +43,7 @@ jobs:
build-args: |
UBUNTU_VERSION=22.04
CUDA_VERSION=11.8
GCC_VERSION=12
CMAKE_VERSION=3.18.5
NINJA_VERSION=1.9.0
tags: |
vectorchai/scalellm_builder:cuda11.8-ubuntu22.04
vectorchai/scalellm_builder:cuda11.8
3 changes: 0 additions & 3 deletions .github/workflows/publish_devel_image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,6 @@ jobs:
build-args: |
UBUNTU_VERSION=22.04
CUDA_VERSION=12.1
GCC_VERSION=12
CMAKE_VERSION=3.18.5
NINJA_VERSION=1.9.0
tags: |
vectorchai/scalellm_devel:cuda12.1-ubuntu22.04
vectorchai/scalellm_devel:latest
Expand Down
34 changes: 15 additions & 19 deletions .github/workflows/release_wheel.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,4 @@
name: Release wheel
# Build & Push scalellm docker image on creation of tags to https://hub.docker.com/r/vectorchai/scalellm
# Push events to matching v*, i.e. v1.0.0, v1.0.0-rc1, v20.15.10-rc5, etc.
# on:
# push:
# tags:
# - v[0-9]+.[0-9]+.[0-9]+*
on:
workflow_dispatch:
inputs:
Expand All @@ -20,9 +14,9 @@ jobs:
build_wheel:
strategy:
matrix:
python: ["3.9"]
cuda: ["12.1"]
torch: ["2.3"]
python: ["3.9", "3.10", "3.11", "3.12"]
cuda: ["11.8", "12.1"]
torch: ["2.1", "2.2", "2.3"]
runs-on: [self-hosted, linux, release]
steps:
- name: Checkout repository
Expand All @@ -32,16 +26,18 @@ jobs:

- name: Build wheel
run: |
docker run --rm -t \
-v "$CI_CACHE_DIR":/ci_cache \
-v "$GITHUB_WORKSPACE":/ScaleLLM \
-e PYTHON_VERSION=${{ matrix.python }} \
-e CUDA_VERSION=${{ matrix.cuda }} \
-e TORCH_VERSION=${{ matrix.torch }} \
-e VCPKG_DEFAULT_BINARY_CACHE=/ci_cache/.vcpkg/bincache \
-e CCACHE_DIR=/ci_cache/.ccache \
vectorchai/scalellm_builder:cuda${{ matrix.cuda }}-ubuntu22.04 \
bash /ScaleLLM/scripts/build_wheel.sh
docker pull vectorchai/scalellm_builder:cuda${{ matrix.cuda }}-ubuntu22.04
docker run --rm -t \
-v "$CI_CACHE_DIR":/ci_cache \
-v "$GITHUB_WORKSPACE":/ScaleLLM \
-e PYTHON_VERSION=${{ matrix.python }} \
-e CUDA_VERSION=${{ matrix.cuda }} \
-e TORCH_VERSION=${{ matrix.torch }} \
-e VCPKG_DEFAULT_BINARY_CACHE=/ci_cache/.vcpkg/bincache \
-e CCACHE_DIR=/ci_cache/.ccache \
--user $(id -u):$(id -g) \
vectorchai/scalellm_builder:cuda${{ matrix.cuda }}-ubuntu22.04 \
bash /ScaleLLM/scripts/build_wheel.sh
timeout-minutes: 60

- name: show wheel size
Expand Down
11 changes: 10 additions & 1 deletion docker/Dockerfile.base
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ ENV DEBIAN_FRONTEND noninteractive
COPY ./common/install_base.sh install_base.sh
RUN bash ./install_base.sh && rm install_base.sh

# Install user
COPY ./common/install_user.sh install_user.sh
RUN bash ./install_user.sh && rm install_user.sh

# Install multiple python versions
COPY ./common/install_python.sh install_python.sh
RUN bash ./install_python.sh "3.9.0"
Expand Down Expand Up @@ -43,7 +47,12 @@ RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi
RUN rm install_ninja.sh

# install rust
ENV RUSTUP_HOME=/usr/local/rustup
ENV CARGO_HOME=/usr/local/cargo
ENV PATH=/usr/local/cargo/bin:$PATH
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
ENV PATH=$HOME/.cargo/bin:$PATH
# make the rustup and cargo homes world-writable so that non-root users can use rust
RUN chmod -R a+w ${RUSTUP_HOME} ${CARGO_HOME}
RUN rustup --version; cargo --version; rustc --version

CMD ["bash"]
28 changes: 13 additions & 15 deletions docker/Dockerfile.devel
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ ENV DEBIAN_FRONTEND noninteractive
COPY ./common/install_base.sh install_base.sh
RUN bash ./install_base.sh && rm install_base.sh

# Install user
COPY ./common/install_user.sh install_user.sh
RUN bash ./install_user.sh && rm install_user.sh

# Install multiple python versions
COPY ./common/install_python.sh install_python.sh
RUN bash ./install_python.sh "3.9.0"
Expand Down Expand Up @@ -42,6 +46,15 @@ COPY ./common/install_ninja.sh install_ninja.sh
RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi
RUN rm install_ninja.sh

# install rust
ENV RUSTUP_HOME=/usr/local/rustup
ENV CARGO_HOME=/usr/local/cargo
ENV PATH=/usr/local/cargo/bin:$PATH
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
# make the rustup and cargo homes world-writable so that non-root users can use rust
RUN chmod -R a+w ${RUSTUP_HOME} ${CARGO_HOME}
RUN rustup --version; cargo --version; rustc --version

# install jemalloc (optional)
RUN cd /tmp && \
wget -nc --no-check-certificate https://github.com/jemalloc/jemalloc/releases/download/5.3.0/jemalloc-5.3.0.tar.bz2 && \
Expand All @@ -51,21 +64,6 @@ RUN cd /tmp && \
make -j$(nproc) && make install && \
ldconfig)

# install nsys
ADD https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2024_2/nsight-systems-2024.2.1_2024.2.1.106-1_amd64.deb .
RUN apt-get install -y ./nsight-systems-2024.2.1_2024.2.1.106-1_amd64.deb

# install rust
ARG UID=1000
ARG GID=1000
ENV RUSTUP_HOME=/usr/local/rustup
ENV CARGO_HOME=/usr/local/cargo
ENV PATH=/usr/local/cargo/bin:$PATH
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
# change owner so that non-root user can install rust packages
RUN chown -R $UID:$GID /usr/local/rustup
RUN chown -R $UID:$GID /usr/local/cargo

# Install miniconda
RUN wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/Miniconda3-latest-Linux-x86_64.sh
RUN cd /tmp && \
Expand Down
16 changes: 16 additions & 0 deletions docker/common/install_user.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
#
# Registers a "jenkins" account (uid 1000, gid 1000) in the image by appending
# records directly to the account files, grants it passwordless sudo, and then
# verifies that sudo can be invoked from that account.

set -ex

# mirror jenkins user in container: passwd record, home is /var/lib/jenkins
printf '%s\n' 'jenkins:x:1000:1000::/var/lib/jenkins:' >> /etc/passwd

# matching group record with gid 1000
printf '%s\n' 'jenkins:x:1000:' >> /etc/group

# shadow record; needed on focal or newer
printf '%s\n' 'jenkins:*:19110:0:99999:7:::' >> /etc/shadow

# allow sudo without a password; this overwrites any existing drop-in file
printf '%s\n' 'jenkins ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/jenkins

# test that sudo works; because of `set -e` a failure here aborts the script
sudo -u jenkins sudo -v
9 changes: 7 additions & 2 deletions tools/run_in_docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ function get_switch_user_cmd() {

(( $# < 1 )) && usage

IMAGE="vectorchai/scalellm:devel"
IMAGE="vectorchai/scalellm_devel:latest"
RUN_OPTS=()
while [[ $# > 1 ]]; do
case "$1" in
Expand All @@ -61,7 +61,12 @@ RUN_OPTS+=(--rm -it --network=host)
RUN_OPTS+=("-v $(pwd):$(pwd)")
RUN_OPTS+=("-v /tmp:/tmp")
RUN_OPTS+=("-v ${HOME}:${HOME}")
CMD="bash -c 'cd $(pwd); VCPKG_DEFAULT_BINARY_CACHE=$(pwd)/.vcpkg/bincache $@'"

# carry over some environment variables
RUN_OPTS+=("-e VCPKG_DEFAULT_BINARY_CACHE=${VCPKG_DEFAULT_BINARY_CACHE}")
RUN_OPTS+=("-e CCACHE_DIR=${CCACHE_DIR}")

CMD="sh -c 'cd $(pwd); $@'"

[[ "${CMD}" = "" ]] && usage
[[ ! -x $(command -v docker) ]] && echo "ERROR: 'docker' command missing from PATH." && usage
Expand Down

0 comments on commit 05dd8f2

Please sign in to comment.