Skip to content

Commit

Permalink
[fix] added manylinux support (#185)
Browse files Browse the repository at this point in the history
- [x] fix the build error for cuda 11.8
- [ ] add cuda 11.8 support for manylinux


/usr/local/cuda/bin/../targets/x86_64-linux/include/crt/host_config.h:132:2:
error: #error -- unsupported GNU version! gcc versions later than 11 are
not supported! The nvcc flag '-allow-unsupported-compiler' can be used
to override this version check; however, using an unsupported host
compiler may cause compilation failure or incorrect run time execution.
Use at your own risk.
132 | #error -- unsupported GNU version! gcc versions later than 11 are
not supported! The nvcc flag '-allow-unsupported-compiler' can be used
to override this version check; however, using an unsupported host
compiler may cause compilation failure or incorrect run time execution.
Use at your own risk.
  • Loading branch information
guocuimi authored May 14, 2024
1 parent 05dd8f2 commit 040d9cf
Show file tree
Hide file tree
Showing 15 changed files with 413 additions and 76 deletions.
49 changes: 0 additions & 49 deletions .github/workflows/publish_base_image.yml

This file was deleted.

5 changes: 5 additions & 0 deletions .github/workflows/publish_devel_image.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
name: Publish devel docker image
on:
workflow_dispatch:
env:
# Tells where to store caches.
CI_CACHE_DIR: ${{ github.workspace }}/../../ci_cache

jobs:
publish_base:
Expand All @@ -27,6 +30,8 @@ jobs:
context: ./docker
file: ./docker/Dockerfile.devel
push: true
# Action inputs are passed literally (no shell expansion), so the cache
# directory must come from the env context rather than $CI_CACHE_DIR.
cache-from: type=local,src=${{ env.CI_CACHE_DIR }}/.buildx-cache
cache-to: type=local,dest=${{ env.CI_CACHE_DIR }}/.buildx-cache
build-args: |
UBUNTU_VERSION=22.04
CUDA_VERSION=12.1
Expand Down
52 changes: 52 additions & 0 deletions .github/workflows/publish_manylinux_image.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Manually triggered workflow that builds docker/Dockerfile.manylinux and
# pushes the resulting image to Docker Hub.
name: Publish manylinux docker image
on:
  workflow_dispatch:
env:
  # Tells where to store caches.
  CI_CACHE_DIR: ${{ github.workspace }}/../../ci_cache

jobs:
  publish_base:
    runs-on: [self-hosted, linux, release]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_HUB_USER }}
          password: ${{ secrets.DOCKER_HUB_TOKEN }}

      - name: Build base for cuda 12.1
        uses: docker/build-push-action@v5
        with:
          context: ./docker
          file: ./docker/Dockerfile.manylinux
          push: true
          # `with:` inputs are not shell-expanded, so the cache location is
          # read through the env context instead of $CI_CACHE_DIR.
          cache-from: type=local,src=${{ env.CI_CACHE_DIR }}/.buildx-cache
          cache-to: type=local,dest=${{ env.CI_CACHE_DIR }}/.buildx-cache
          build-args: |
            CUDA_VERSION=12.1
          tags: |
            vectorchai/scalellm_manylinux:cuda12.1

      # Disabled for now: the CUDA 11.8 manylinux image is not published yet.
      # - name: Build base for cuda 11.8
      #   uses: docker/build-push-action@v5
      #   with:
      #     context: ./docker
      #     file: ./docker/Dockerfile.manylinux
      #     push: true
      #     cache-from: type=local,src=${{ env.CI_CACHE_DIR }}/.buildx-cache
      #     cache-to: type=local,dest=${{ env.CI_CACHE_DIR }}/.buildx-cache
      #     build-args: |
      #       CUDA_VERSION=11.8
      #     tags: |
      #       vectorchai/scalellm_manylinux:cuda11.8

9 changes: 5 additions & 4 deletions .github/workflows/release_wheel.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@ env:
jobs:
build_wheel:
strategy:
fail-fast: false
matrix:
python: ["3.9", "3.10", "3.11", "3.12"]
cuda: ["11.8", "12.1"]
torch: ["2.1", "2.2", "2.3"]
python: ["3.9", "3.10", "3.11"]
cuda: ["12.1"]
torch: ["2.2", "2.3"]
runs-on: [self-hosted, linux, release]
steps:
- name: Checkout repository
Expand All @@ -36,7 +37,7 @@ jobs:
-e VCPKG_DEFAULT_BINARY_CACHE=/ci_cache/.vcpkg/bincache \
-e CCACHE_DIR=/ci_cache/.ccache \
--user $(id -u):$(id -g) \
vectorchai/scalellm_builder:cuda${{ matrix.cuda }}-ubuntu22.04 \
vectorchai/scalellm_manylinux:cuda${{ matrix.cuda }} \
bash /ScaleLLM/scripts/build_wheel.sh
timeout-minutes: 60

Expand Down
10 changes: 8 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ set_property(GLOBAL PROPERTY USE_FOLDERS ON)

# Build-time switches; each can be overridden with -D<NAME>=ON|OFF.
# USE_CCACHE: try to wrap compilation with ccache.
option(USE_CCACHE "Attempt using CCache to wrap the compilation" ON)
# USE_CXX11_ABI: choose the new C++11 ABI; it is consumed further down in this file.
option(USE_CXX11_ABI "Use the new C++-11 ABI, which is not backwards compatible." ON)
# USE_MANYLINUX: when ON, Python is located with the Development.Module
# component instead of the full Development component.
option(USE_MANYLINUX "Build for manylinux" OFF)

set(CMAKE_POSITION_INDEPENDENT_CODE ON)

Expand Down Expand Up @@ -144,7 +145,6 @@ if(UNIX)
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Og")
endif()

find_package(Boost CONFIG REQUIRED)
find_package(Threads REQUIRED)
# find all dependencies from vcpkg
find_package(fmt CONFIG REQUIRED)
Expand All @@ -162,7 +162,13 @@ find_package(prometheus-cpp CONFIG REQUIRED)
find_package(stduuid CONFIG REQUIRED)
find_package(RapidJSON CONFIG REQUIRED)

find_package(Python REQUIRED COMPONENTS Interpreter Development)
# Choose which Python development component to request.
# manylinux doesn't ship Development.Embed, so only Development.Module is
# requested there; every other build asks for the full Development component.
set(python_dev_component Development)
if(USE_MANYLINUX)
  set(python_dev_component Development.Module)
endif()
find_package(Python REQUIRED COMPONENTS Interpreter ${python_dev_component})

find_package(NCCL REQUIRED)

if (USE_CXX11_ABI)
Expand Down
2 changes: 1 addition & 1 deletion docker/Dockerfile.base
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ ENV DESIRED_CUDA ${CUDA_VERSION}
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH

# Install gcc
ARG GCC_VERSION=12
ARG GCC_VERSION=11
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
software-properties-common gpg-agent
Expand Down
48 changes: 48 additions & 0 deletions docker/Dockerfile.manylinux
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Builder image based on manylinux_2_28 (x86_64) with CUDA, ninja, ccache
# and a Rust toolchain installed on top.
FROM quay.io/pypa/manylinux_2_28_x86_64 AS base

LABEL maintainer="mi@vectorch.com"
ENV DEBIAN_FRONTEND=noninteractive

ENV LC_ALL=en_US.UTF-8
ENV LANG=en_US.UTF-8
ENV LANGUAGE=en_US.UTF-8

# Install common dependencies
COPY ./common/install_base.sh install_base.sh
RUN bash ./install_base.sh && rm install_base.sh

# Install user
COPY ./common/install_user.sh install_user.sh
RUN bash ./install_user.sh && rm install_user.sh

# Install cuda, cudnn and nccl
ARG CUDA_VERSION=12.1
COPY ./common/install_cuda.sh install_cuda.sh
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH

# ARG CMAKE_VERSION=3.18.5
# COPY ./common/install_cmake.sh install_cmake.sh
# RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
# RUN rm install_cmake.sh

# Ninja is installed only when NINJA_VERSION is non-empty.
ARG NINJA_VERSION=1.11.1
COPY ./common/install_ninja.sh install_ninja.sh
RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi
RUN rm install_ninja.sh

# ccache is installed only when CCACHE_VERSION is non-empty.
ARG CCACHE_VERSION=4.8.3
COPY ./common/install_ccache.sh install_ccache.sh
RUN if [ -n "${CCACHE_VERSION}" ]; then bash ./install_ccache.sh; fi
RUN rm install_ccache.sh

# install rust
ENV RUSTUP_HOME=/usr/local/rustup
ENV CARGO_HOME=/usr/local/cargo
ENV PATH=/usr/local/cargo/bin:$PATH
# Restrict the bootstrap download to HTTPS with TLS >= 1.2.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
# give everyone permission to use rust
RUN chmod -R a+w ${RUSTUP_HOME} ${CARGO_HOME}
# Chain with && so a missing rustup, cargo or rustc fails the build;
# with `;` only the last command's status was checked.
RUN rustup --version && cargo --version && rustc --version

CMD ["bash"]
20 changes: 20 additions & 0 deletions docker/common/install_base.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,31 @@ install_ubuntu() {
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
}

# Installs the common command-line tools on AlmaLinux via yum.
install_almalinux() {
  # Base utilities required by the image.
  local -a base_packages=(
    zip
    wget
    curl
    perl
    sudo
    vim
    jq
    libtool
    unzip
  )

  # Refresh installed packages first, then add the tools in one transaction.
  yum -y update
  yum -y install "${base_packages[@]}"

  # Drop yum caches once installation is done.
  yum clean all
}

ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
case "$ID" in
ubuntu)
install_ubuntu
;;
almalinux)
install_almalinux
;;
*)
echo "Unable to determine OS..."
exit 1
Expand Down
16 changes: 16 additions & 0 deletions docker/common/install_ccache.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
# Installs a prebuilt ccache release to /usr/bin/ccache.
#
# Required environment:
#   CCACHE_VERSION  release to download, e.g. 4.8.3 (the script aborts if empty)

# -e: stop on first error, -x: trace commands,
# pipefail: a failed download or decompression fails the whole pipeline.
set -exo pipefail

# Under `set -e` this bare test aborts the script when no version is given.
[ -n "$CCACHE_VERSION" ]

ARCH=$(uname -m)
# Name shared by the release tarball and the directory it unpacks to.
release=ccache-${CCACHE_VERSION}-linux-${ARCH}
url=https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/${release}.tar.xz

pushd /tmp
curl -L "$url" | xz -d | tar -x
# Use the detected architecture here as well; the path used to hard-code
# x86_64 and would not match the archive fetched on other machines.
cp "./${release}/ccache" /usr/bin/ccache
# Remove the unpacked tree so it does not linger in the image layer.
rm -rf "./${release}"
popd

# set max cache size to 5GiB
/usr/bin/ccache -M 5Gi
3 changes: 3 additions & 0 deletions docker/common/install_cmake.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ case "$ID" in
ubuntu)
apt-get remove cmake -y
;;
almalinux)
rm /usr/local/bin/cmake
;;
*)
echo "Unable to determine OS..."
exit 1
Expand Down
Loading

0 comments on commit 040d9cf

Please sign in to comment.