From 785b3b73b967445314b8f2f5a555b2314ff4c91b Mon Sep 17 00:00:00 2001 From: Michael Mi Date: Sun, 13 Oct 2024 11:35:25 -0700 Subject: [PATCH] upgrade pytorch to 2.4.1 (#341) --- .github/workflows/build_wheel.yml | 2 +- .github/workflows/package_test.yml | 2 +- .github/workflows/publish_wheel.yml | 2 +- .github/workflows/release_test.yml | 2 +- CMakeLists.txt | 20 ++++++++++---------- README.md | 2 +- docker/common/install_cuda.sh | 27 +++++++++++++++++++-------- 7 files changed, 34 insertions(+), 23 deletions(-) diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml index cc28e3d0..9d567f10 100644 --- a/.github/workflows/build_wheel.yml +++ b/.github/workflows/build_wheel.yml @@ -25,7 +25,7 @@ jobs: matrix: python: ["3.8", "3.9", "3.10", "3.11", "3.12"] cuda: ["11.8", "12.1", "12.4"] - torch: ["2.2.2", "2.3.1", "2.4.0"] + torch: ["2.2.2", "2.3.1", "2.4.1"] exclude: - cuda: "12.4" torch: "2.3.1" diff --git a/.github/workflows/package_test.yml b/.github/workflows/package_test.yml index 553b6303..e701c715 100644 --- a/.github/workflows/package_test.yml +++ b/.github/workflows/package_test.yml @@ -45,7 +45,7 @@ jobs: matrix: python: ["3.10"] cuda: ["12.4"] - torch: ["2.4.0"] + torch: ["2.4.1"] runs-on: [self-hosted, linux, build] env: PYTHON_VERSION: ${{ matrix.python }} diff --git a/.github/workflows/publish_wheel.yml b/.github/workflows/publish_wheel.yml index e9d7c62c..75b3bbd1 100644 --- a/.github/workflows/publish_wheel.yml +++ b/.github/workflows/publish_wheel.yml @@ -24,7 +24,7 @@ jobs: matrix: python: ["3.8", "3.9", "3.10", "3.11", "3.12"] cuda: ["12.1"] - torch: ["2.4.0"] + torch: ["2.4.1"] runs-on: [self-hosted, linux, release] env: PYTHON_VERSION: ${{ matrix.python }} diff --git a/.github/workflows/release_test.yml b/.github/workflows/release_test.yml index dccf5721..92f4f455 100644 --- a/.github/workflows/release_test.yml +++ b/.github/workflows/release_test.yml @@ -15,7 +15,7 @@ jobs: matrix: python: ["3.10"] cuda: ["12.4"] - torch: ["2.4.0"] + torch: ["2.4.1"] runs-on: [self-hosted, linux, release] env: PYTHON_VERSION: ${{ matrix.python }} diff --git a/CMakeLists.txt b/CMakeLists.txt index 85049374..9d4b8667 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -194,25 +194,25 @@ if (DEFINED ENV{LIBTORCH_ROOT}) else() include(FetchContent) if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.4) - # download libtorch 2.4.0 with cuda 12.4 from pytorch.org + # download libtorch 2.4.1 with cuda 12.4 from pytorch.org if (USE_CXX11_ABI) - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu124/libtorch-cxx11-abi-shared-with-deps-2.4.0%2Bcu124.zip") + set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu124/libtorch-cxx11-abi-shared-with-deps-2.4.1%2Bcu124.zip") else() - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu124/libtorch-shared-with-deps-2.4.0%2Bcu124.zip") + set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu124/libtorch-shared-with-deps-2.4.1%2Bcu124.zip") endif() elseif(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.1) - # download libtorch 2.4.0 with cuda 12.1 from pytorch.org + # download libtorch 2.4.1 with cuda 12.1 from pytorch.org if (USE_CXX11_ABI) - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu121/libtorch-cxx11-abi-shared-with-deps-2.4.0%2Bcu121.zip") + set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu121/libtorch-cxx11-abi-shared-with-deps-2.4.1%2Bcu121.zip") else() - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu121/libtorch-shared-with-deps-2.4.0%2Bcu121.zip") + set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu121/libtorch-shared-with-deps-2.4.1%2Bcu121.zip") endif() elseif(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.8) - # download libtorch 2.4.0 with cuda 11.8 from pytorch.org + # download libtorch 2.4.1 with cuda 11.8 from pytorch.org if (USE_CXX11_ABI) - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.4.0%2Bcu118.zip") + set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.4.1%2Bcu118.zip") else() - set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu118/libtorch-shared-with-deps-2.4.0%2Bcu118.zip") + set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu118/libtorch-shared-with-deps-2.4.1%2Bcu118.zip") endif() else() # error out if cuda version is not supported @@ -232,7 +232,7 @@ else() FetchContent_MakeAvailable(libtorch) find_package(Torch REQUIRED PATHS ${libtorch_SOURCE_DIR} NO_DEFAULT_PATH) - message(STATUS "Downloading and using libtorch 2.4.0 for cuda ${CUDA_VERSION} at ${libtorch_SOURCE_DIR}") + message(STATUS "Downloading and using libtorch 2.4.1 for cuda ${CUDA_VERSION} at ${libtorch_SOURCE_DIR}") endif() # check if USE_CXX11_ABI is set correctly diff --git a/README.md b/README.md index 5d89b4c4..d5286758 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ ScaleLLM is currently undergoing active development. We are fully committed to c ScaleLLM is available as a Python Wheel package on PyPI. You can install it using pip: ```bash -# Install scalellm with CUDA 12.1 and Pytorch 2.4.0 +# Install scalellm with CUDA 12.1 and Pytorch 2.4.1 pip install -U scalellm ``` diff --git a/docker/common/install_cuda.sh b/docker/common/install_cuda.sh index 1e90973c..413c08f9 100755 --- a/docker/common/install_cuda.sh +++ b/docker/common/install_cuda.sh @@ -1,6 +1,6 @@ #!/bin/bash -# adapted from https://github.com/pytorch/builder/blob/main/common/install_cuda.sh +# adapted from https://github.com/pytorch/pytorch/blob/main/.ci/docker/common/install_cuda.sh set -ex @@ -29,6 +29,17 @@ function install_cusparselt_052 { rm -rf tmp_cusparselt } +function install_cusparselt_062 { + # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html + mkdir tmp_cusparselt && pushd tmp_cusparselt + wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.6.2.3-archive.tar.xz + tar xf libcusparse_lt-linux-x86_64-0.6.2.3-archive.tar.xz + cp -a libcusparse_lt-linux-x86_64-0.6.2.3-archive/include/* /usr/local/cuda/include/ + cp -a libcusparse_lt-linux-x86_64-0.6.2.3-archive/lib/* /usr/local/cuda/lib64/ + popd + rm -rf tmp_cusparselt +} + function install_118 { echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.4.0" rm -rf /usr/local/cuda-11.8 /usr/local/cuda @@ -96,13 +107,13 @@ function install_121 { } function install_124 { - echo "Installing CUDA 12.4 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2" + echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2" rm -rf /usr/local/cuda-12.4 /usr/local/cuda - # install CUDA 12.4.0 in the same container - wget -q https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run - chmod +x cuda_12.4.0_550.54.14_linux.run - ./cuda_12.4.0_550.54.14_linux.run --toolkit --silent - rm -f cuda_12.4.0_550.54.14_linux.run + # install CUDA 12.4.1 in the same container + wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run + chmod +x cuda_12.4.1_550.54.15_linux.run + ./cuda_12.4.1_550.54.15_linux.run --toolkit --silent + rm -f cuda_12.4.1_550.54.15_linux.run rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement @@ -123,7 +134,7 @@ function install_124 { cd .. rm -rf nccl - install_cusparselt_052 + install_cusparselt_062 ldconfig }