From 05dd8f267ef7402e069d7a819561a708d2a30415 Mon Sep 17 00:00:00 2001 From: Michael Mi Date: Mon, 13 May 2024 21:42:20 -0700 Subject: [PATCH] [CI] fix docker image issues and build wheel for different python, pytorch versions (#184) --- .github/workflows/build.yml | 8 +++--- .github/workflows/format.yml | 2 +- .github/workflows/publish_base_image.yml | 8 ++---- .github/workflows/publish_devel_image.yml | 3 -- .github/workflows/release_wheel.yml | 34 ++++++++++------------- docker/Dockerfile.base | 11 +++++++- docker/Dockerfile.devel | 28 +++++++++---------- docker/common/install_user.sh | 16 +++++++++++ tools/run_in_docker.sh | 9 ++++-- 9 files changed, 68 insertions(+), 51 deletions(-) create mode 100644 docker/common/install_user.sh diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 491dbb5d..7c432376 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,4 +1,4 @@ -name: build and test +name: Build and test on: push: @@ -46,11 +46,11 @@ jobs: env: BUILD_TYPE: ${{ matrix.build_type }} # Tells vcpkg where binary packages are stored. - VCPKG_DEFAULT_BINARY_CACHE: ${{ github.workspace }}/../../_vcpkg/bincache + VCPKG_DEFAULT_BINARY_CACHE: ${{ github.workspace }}/../../ci_cache/.vcpkg/bincache # Tells ccache where to store its cache. - CCACHE_DIR: ${{ github.workspace }}/../../_ccache + CCACHE_DIR: ${{ github.workspace }}/../../ci_cache/.ccache # Tells where to store the dependencies. - DEPENDENCES_ROOT: ${{ github.workspace }}/../../_deps + DEPENDENCES_ROOT: ${{ github.workspace }}/../../ci_cache/.deps steps: - name: Install toolkits diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index 17b8aaf6..6ecbfa4e 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -1,4 +1,4 @@ -name: format +name: Format on: pull_request: branches: diff --git a/.github/workflows/publish_base_image.yml b/.github/workflows/publish_base_image.yml index 76e4ef23..26c82726 100644 --- a/.github/workflows/publish_base_image.yml +++ b/.github/workflows/publish_base_image.yml @@ -30,11 +30,9 @@ jobs: build-args: | UBUNTU_VERSION=22.04 CUDA_VERSION=12.1 - GCC_VERSION=12 - CMAKE_VERSION=3.18.5 - NINJA_VERSION=1.9.0 tags: | vectorchai/scalellm_builder:cuda12.1-ubuntu22.04 + vectorchai/scalellm_builder:cuda12.1 - name: Build base for cuda 11.8 uses: docker/build-push-action@v5 @@ -45,9 +43,7 @@ jobs: build-args: | UBUNTU_VERSION=22.04 CUDA_VERSION=11.8 - GCC_VERSION=12 - CMAKE_VERSION=3.18.5 - NINJA_VERSION=1.9.0 tags: | vectorchai/scalellm_builder:cuda11.8-ubuntu22.04 + vectorchai/scalellm_builder:cuda11.8 diff --git a/.github/workflows/publish_devel_image.yml b/.github/workflows/publish_devel_image.yml index a0e3f815..2dcb688d 100644 --- a/.github/workflows/publish_devel_image.yml +++ b/.github/workflows/publish_devel_image.yml @@ -30,9 +30,6 @@ jobs: build-args: | UBUNTU_VERSION=22.04 CUDA_VERSION=12.1 - GCC_VERSION=12 - CMAKE_VERSION=3.18.5 - NINJA_VERSION=1.9.0 tags: | vectorchai/scalellm_devel:cuda12.1-ubuntu22.04 vectorchai/scalellm_devel:latest diff --git a/.github/workflows/release_wheel.yml b/.github/workflows/release_wheel.yml index c8bc041c..4aefb1c6 100644 --- a/.github/workflows/release_wheel.yml +++ b/.github/workflows/release_wheel.yml @@ -1,10 +1,4 @@ name: Release wheel -# Build & Push scalellm docker image on creation of tags to https://hub.docker.com/r/vectorchai/scalellm -# Push events to matching v*, i.e. v1.0.0, v1.0.0-rc1, v20.15.10-rc5, etc. -# on: -# push: -# tags: -# - v[0-9]+.[0-9]+.[0-9]+* on: workflow_dispatch: inputs: @@ -20,9 +14,9 @@ jobs: build_wheel: strategy: matrix: - python: ["3.9"] - cuda: ["12.1"] - torch: ["2.3"] + python: ["3.9", "3.10", "3.11", "3.12"] + cuda: ["11.8", "12.1"] + torch: ["2.1", "2.2", "2.3"] runs-on: [self-hosted, linux, release] steps: - name: Checkout repository @@ -32,16 +26,18 @@ jobs: - name: Build wheel run: | - docker run --rm -t \ - -v "$CI_CACHE_DIR":/ci_cache \ - -v "$GITHUB_WORKSPACE":/ScaleLLM \ - -e PYTHON_VERSION=${{ matrix.python }} \ - -e CUDA_VERSION=${{ matrix.cuda }} \ - -e TORCH_VERSION=${{ matrix.torch }} \ - -e VCPKG_DEFAULT_BINARY_CACHE=/ci_cache/.vcpkg/bincache \ - -e CCACHE_DIR=/ci_cache/.ccache \ - vectorchai/scalellm_builder:cuda${{ matrix.cuda }}-ubuntu22.04 \ - bash /ScaleLLM/scripts/build_wheel.sh + docker pull vectorchai/scalellm_builder:cuda${{ matrix.cuda }}-ubuntu22.04 + docker run --rm -t \ + -v "$CI_CACHE_DIR":/ci_cache \ + -v "$GITHUB_WORKSPACE":/ScaleLLM \ + -e PYTHON_VERSION=${{ matrix.python }} \ + -e CUDA_VERSION=${{ matrix.cuda }} \ + -e TORCH_VERSION=${{ matrix.torch }} \ + -e VCPKG_DEFAULT_BINARY_CACHE=/ci_cache/.vcpkg/bincache \ + -e CCACHE_DIR=/ci_cache/.ccache \ + --user $(id -u):$(id -g) \ + vectorchai/scalellm_builder:cuda${{ matrix.cuda }}-ubuntu22.04 \ + bash /ScaleLLM/scripts/build_wheel.sh timeout-minutes: 60 - name: show wheel size diff --git a/docker/Dockerfile.base b/docker/Dockerfile.base index 6c9f694d..fde5c1d4 100644 --- a/docker/Dockerfile.base +++ b/docker/Dockerfile.base @@ -9,6 +9,10 @@ ENV DEBIAN_FRONTEND noninteractive COPY ./common/install_base.sh install_base.sh RUN bash ./install_base.sh && rm install_base.sh +# Install user +COPY ./common/install_user.sh install_user.sh +RUN bash ./install_user.sh && rm install_user.sh + # Install multiple python versions COPY ./common/install_python.sh install_python.sh RUN bash ./install_python.sh "3.9.0" @@ -43,7 +47,12 @@ RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi RUN rm install_ninja.sh # install rust +ENV RUSTUP_HOME=/usr/local/rustup +ENV CARGO_HOME=/usr/local/cargo +ENV PATH=/usr/local/cargo/bin:$PATH RUN curl https://sh.rustup.rs -sSf | sh -s -- -y -ENV PATH=$HOME/.cargo/bin:$PATH +# give everyone permission to use rust +RUN chmod -R a+w ${RUSTUP_HOME} ${CARGO_HOME} +RUN rustup --version; cargo --version; rustc --version CMD ["bash"] \ No newline at end of file diff --git a/docker/Dockerfile.devel b/docker/Dockerfile.devel index f974a5be..c4caca89 100644 --- a/docker/Dockerfile.devel +++ b/docker/Dockerfile.devel @@ -9,6 +9,10 @@ ENV DEBIAN_FRONTEND noninteractive COPY ./common/install_base.sh install_base.sh RUN bash ./install_base.sh && rm install_base.sh +# Install user +COPY ./common/install_user.sh install_user.sh +RUN bash ./install_user.sh && rm install_user.sh + # Install multiple python versions COPY ./common/install_python.sh install_python.sh RUN bash ./install_python.sh "3.9.0" @@ -42,6 +46,15 @@ COPY ./common/install_ninja.sh install_ninja.sh RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi RUN rm install_ninja.sh +# install rust +ENV RUSTUP_HOME=/usr/local/rustup +ENV CARGO_HOME=/usr/local/cargo +ENV PATH=/usr/local/cargo/bin:$PATH +RUN curl https://sh.rustup.rs -sSf | sh -s -- -y +# give everyone permission to use rust +RUN chmod -R a+w ${RUSTUP_HOME} ${CARGO_HOME} +RUN rustup --version; cargo --version; rustc --version + # install jemalloc (optional) RUN cd /tmp && \ wget -nc --no-check-certificate https://github.com/jemalloc/jemalloc/releases/download/5.3.0/jemalloc-5.3.0.tar.bz2 && \ @@ -51,21 +64,6 @@ RUN cd /tmp && \ make -j$(nproc) && make install && \ ldconfig) -# install nsys -ADD https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2024_2/nsight-systems-2024.2.1_2024.2.1.106-1_amd64.deb . -RUN apt-get install -y ./nsight-systems-2024.2.1_2024.2.1.106-1_amd64.deb - -# install rust -ARG UID=1000 -ARG GID=1000 -ENV RUSTUP_HOME=/usr/local/rustup -ENV CARGO_HOME=/usr/local/cargo -ENV PATH=/usr/local/cargo/bin:$PATH -RUN curl https://sh.rustup.rs -sSf | sh -s -- -y -# change owner so that non-root user can install rust packages -RUN chown -R $UID:$GID /usr/local/rustup -RUN chown -R $UID:$GID /usr/local/cargo - # Install miniconda RUN wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/Miniconda3-latest-Linux-x86_64.sh RUN cd /tmp && \ diff --git a/docker/common/install_user.sh b/docker/common/install_user.sh new file mode 100644 index 00000000..9d416049 --- /dev/null +++ b/docker/common/install_user.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -ex + +# mirror jenkins user in container +echo "jenkins:x:1000:1000::/var/lib/jenkins:" >> /etc/passwd +echo "jenkins:x:1000:" >> /etc/group +# needed on focal or newer +echo "jenkins:*:19110:0:99999:7:::" >>/etc/shadow + +# allow sudo +echo 'jenkins ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/jenkins + + +# test that sudo works +sudo -u jenkins sudo -v \ No newline at end of file diff --git a/tools/run_in_docker.sh b/tools/run_in_docker.sh index 9cc1d37f..1485fb85 100755 --- a/tools/run_in_docker.sh +++ b/tools/run_in_docker.sh @@ -42,7 +42,7 @@ function get_switch_user_cmd() { (( $# < 1 )) && usage -IMAGE="vectorchai/scalellm:devel" +IMAGE="vectorchai/scalellm_devel:latest" RUN_OPTS=() while [[ $# > 1 ]]; do case "$1" in @@ -61,7 +61,12 @@ RUN_OPTS+=(--rm -it --network=host) RUN_OPTS+=("-v $(pwd):$(pwd)") RUN_OPTS+=("-v /tmp:/tmp") RUN_OPTS+=("-v ${HOME}:${HOME}") -CMD="bash -c 'cd $(pwd); VCPKG_DEFAULT_BINARY_CACHE=$(pwd)/.vcpkg/bincache $@'" + +# carry over some environment variables +RUN_OPTS+=("-e VCPKG_DEFAULT_BINARY_CACHE=${VCPKG_DEFAULT_BINARY_CACHE}") +RUN_OPTS+=("-e CCACHE_DIR=${CCACHE_DIR}") + +CMD="sh -c 'cd $(pwd); $@'" [[ "${CMD}" = "" ]] && usage [[ ! -x $(command -v docker) ]] && echo "ERROR: 'docker' command missing from PATH." && usage