diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e23d9e5fb..97dfd7813 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -12,7 +12,7 @@ concurrency: jobs: cmake-build: name: Build FlexFlow Serve - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 defaults: run: shell: bash -l {0} # required to use an activated conda environment @@ -68,14 +68,14 @@ jobs: ../config/config.linux make -j $n_build_cores make install - # sudo make install - # sudo ldconfig - name: Check availability of flexflow modules in Python run: | if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then export LD_LIBRARY_PATH="$CUDA_PATH/lib64/stubs:$LD_LIBRARY_PATH" sudo ln -s $CUDA_PATH/lib64/stubs/libcuda.so $CUDA_PATH/lib64/stubs/libcuda.so.1 + else + sudo ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 $CONDA_PREFIX/lib/libstdc++.so.6 fi # Remove build folder to check that the installed version can run independently of the build files rm -rf build diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml index 3af6ba664..3b2343079 100644 --- a/.github/workflows/clang-format-check.yml +++ b/.github/workflows/clang-format-check.yml @@ -3,7 +3,7 @@ on: [push, pull_request, workflow_dispatch] jobs: formatting-check: name: Formatting Check - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 strategy: matrix: path: diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 757689d22..ab50784b3 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -14,7 +14,7 @@ concurrency: jobs: docker-build-rocm: name: Build and Install FlexFlow in a Docker Container (ROCm backend) - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 if: ${{ ( github.event_name != 'push' && github.event_name != 'schedule' && github.event_name != 'workflow_dispatch' ) || github.ref_name != 'inference' }} env: FF_GPU_BACKEND: "hip_rocm" @@ -69,7 +69,7 @@ jobs: docker-build-cuda: name: Build and Install FlexFlow in a Docker Container (CUDA backend) - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 strategy: matrix: cuda_version: ["11.8", "12.0", "12.1", "12.2"] @@ -119,7 +119,7 @@ jobs: notify-slack: name: Notify Slack in case of failure - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: [docker-build-cuda, docker-build-and-publish-rocm] if: ${{ failure() && github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} steps: diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 748ff5094..d57ff8334 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -129,6 +129,7 @@ jobs: CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }} run: | source ./build/set_python_envs.sh + ./tests/fine_grained_alignment_test.sh ./tests/inference_tests.sh - name: Run PEFT tests diff --git a/.github/workflows/helpers/install_cudnn.sh b/.github/workflows/helpers/install_cudnn.sh index 73b8e8841..1e8038573 100755 --- a/.github/workflows/helpers/install_cudnn.sh +++ b/.github/workflows/helpers/install_cudnn.sh @@ -8,72 +8,11 @@ cd "${BASH_SOURCE[0]%/*}" ubuntu_version=$(lsb_release -rs) ubuntu_version=${ubuntu_version//./} -# Install CUDNN -cuda_version=${1:-12.1.1} -cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.') -echo "Installing CUDNN for CUDA version: ${cuda_version} ..." 
-CUDNN_LINK=http://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.1-linux-x64-v8.0.5.39.tgz -CUDNN_TARBALL_NAME=cudnn-11.1-linux-x64-v8.0.5.39.tgz -if [[ "$cuda_version" == "10.1" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-10.1-linux-x64-v8.0.5.39.tgz - CUDNN_TARBALL_NAME=cudnn-10.1-linux-x64-v8.0.5.39.tgz -elif [[ "$cuda_version" == "10.2" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-10.2-linux-x64-v8.0.5.39.tgz - CUDNN_TARBALL_NAME=cudnn-10.2-linux-x64-v8.0.5.39.tgz -elif [[ "$cuda_version" == "11.0" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.0-linux-x64-v8.0.5.39.tgz - CUDNN_TARBALL_NAME=cudnn-11.0-linux-x64-v8.0.5.39.tgz -elif [[ "$cuda_version" == "11.1" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.1-linux-x64-v8.0.5.39.tgz - CUDNN_TARBALL_NAME=cudnn-11.1-linux-x64-v8.0.5.39.tgz -elif [[ "$cuda_version" == "11.2" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.1.1/cudnn-11.2-linux-x64-v8.1.1.33.tgz - CUDNN_TARBALL_NAME=cudnn-11.2-linux-x64-v8.1.1.33.tgz -elif [[ "$cuda_version" == "11.3" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.2.1/cudnn-11.3-linux-x64-v8.2.1.32.tgz - CUDNN_TARBALL_NAME=cudnn-11.3-linux-x64-v8.2.1.32.tgz -elif [[ "$cuda_version" == "11.4" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.2.4/cudnn-11.4-linux-x64-v8.2.4.15.tgz - CUDNN_TARBALL_NAME=cudnn-11.4-linux-x64-v8.2.4.15.tgz -elif [[ "$cuda_version" == "11.5" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.0/cudnn-11.5-linux-x64-v8.3.0.98.tgz - CUDNN_TARBALL_NAME=cudnn-11.5-linux-x64-v8.3.0.98.tgz -elif [[ "$cuda_version" == "11.6" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.4.0/local_installers/11.6/cudnn-linux-x86_64-8.4.0.27_cuda11.6-archive.tar.xz - CUDNN_TARBALL_NAME=cudnn-linux-x86_64-8.4.0.27_cuda11.6-archive.tar.xz -elif [[ "$cuda_version" == "11.7" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.5.0/local_installers/11.7/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz - CUDNN_TARBALL_NAME=cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz -elif [[ "$cuda_version" == "11.8" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz - CUDNN_TARBALL_NAME=cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz -elif [[ "$cuda_version" == "12.0" || "$cuda_version" == "12.1" || "$cuda_version" == "12.2" || "$cuda_version" == "12.3" || "$cuda_version" == "12.4" || "$cuda_version" == "12.5" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.8.0/local_installers/12.0/cudnn-local-repo-ubuntu2004-8.8.0.121_1.0-1_amd64.deb - CUDNN_TARBALL_NAME=cudnn-local-repo-ubuntu2004-8.8.0.121_1.0-1_amd64.deb -else - echo "CUDNN support for CUDA version above 12.5 not yet added" - exit 1 -fi -wget -c -q $CUDNN_LINK -if [[ "$cuda_version" == "11.6" || "$cuda_version" == "11.7" || "$cuda_version" == "11.8" ]]; then - tar -xf $CUDNN_TARBALL_NAME -C ./ - CUDNN_EXTRACTED_TARBALL_NAME="${CUDNN_TARBALL_NAME::-7}" - sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/include/* /usr/local/include - sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/lib/* /usr/local/lib - rm -rf 
"$CUDNN_EXTRACTED_TARBALL_NAME" -elif [[ "$CUDNN_TARBALL_NAME" == *.deb ]]; then - wget -c -q "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb" - sudo dpkg -i cuda-keyring_1.1-1_all.deb - sudo apt update -y - rm -f cuda-keyring_1.1-1_all.deb - sudo dpkg -i $CUDNN_TARBALL_NAME - sudo cp /var/cudnn-local-repo-ubuntu2004-8.8.0.121/cudnn-local-A9E17745-keyring.gpg /usr/share/keyrings/ - sudo apt update -y - sudo apt install -y libcudnn8 - sudo apt install -y libcudnn8-dev - sudo apt install -y libcudnn8-samples -else - sudo tar -xzf $CUDNN_TARBALL_NAME -C /usr/local -fi -rm $CUDNN_TARBALL_NAME +wget -c -q "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb" +sudo dpkg -i cuda-keyring_1.1-1_all.deb +sudo apt update -y +rm -f cuda-keyring_1.1-1_all.deb +sudo apt-get -y install libcudnn9-cuda-12 +sudo apt-get -y install libcudnn9-dev-cuda-12 +sudo apt-get -y install libcudnn9-samples sudo ldconfig diff --git a/.github/workflows/helpers/install_dependencies.sh b/.github/workflows/helpers/install_dependencies.sh index 9d07ea71f..ac52a80e1 100755 --- a/.github/workflows/helpers/install_dependencies.sh +++ b/.github/workflows/helpers/install_dependencies.sh @@ -40,7 +40,20 @@ if [[ "$FF_GPU_BACKEND" == "hip_cuda" || "$FF_GPU_BACKEND" = "hip_rocm" ]]; then elif [ "$hip_version" = "5.5" ]; then AMD_GPU_SCRIPT_NAME=amdgpu-install_5.5.50500-1_all.deb fi - AMD_GPU_SCRIPT_URL="https://repo.radeon.com/amdgpu-install/${hip_version}/ubuntu/focal/${AMD_GPU_SCRIPT_NAME}" + # Detect Ubuntu version + UBUNTU_VERSION=$(lsb_release -rs) + if [[ "$UBUNTU_VERSION" == "20.04" ]]; then + UBUNTU_CODENAME="focal" + elif [[ "$UBUNTU_VERSION" == "22.04" ]]; then + UBUNTU_CODENAME="jammy" + elif [[ "$UBUNTU_VERSION" == "24.04" ]]; then + UBUNTU_CODENAME="jammy" + else + echo "Unsupported Ubuntu version: $UBUNTU_VERSION" + exit 1 + fi + + AMD_GPU_SCRIPT_URL="https://repo.radeon.com/amdgpu-install/${hip_version}/ubuntu/${UBUNTU_CODENAME}/${AMD_GPU_SCRIPT_NAME}" # Download and install AMD GPU software with ROCM and HIP support wget "$AMD_GPU_SCRIPT_URL" sudo apt-get install -y ./${AMD_GPU_SCRIPT_NAME} @@ -48,20 +61,20 @@ if [[ "$FF_GPU_BACKEND" == "hip_cuda" || "$FF_GPU_BACKEND" = "hip_rocm" ]]; then sudo amdgpu-install -y --usecase=hip,rocm --no-dkms sudo apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk rocm-device-libs - # Install protobuf v3.20.x manually - sudo apt-get update -y && sudo apt-get install -y pkg-config zip g++ zlib1g-dev unzip python autoconf automake libtool curl make - git clone -b 3.20.x https://github.com/protocolbuffers/protobuf.git - cd protobuf/ - git submodule update --init --recursive - ./autogen.sh - ./configure - cores_available=$(nproc --all) - n_build_cores=$(( cores_available -1 )) - if (( n_build_cores < 1 )) ; then n_build_cores=1 ; fi - make -j $n_build_cores - sudo make install - sudo ldconfig - cd .. + # # Install protobuf v3.20.x manually + # sudo apt-get update -y && sudo apt-get install -y pkg-config zip g++ zlib1g-dev unzip python autoconf automake libtool curl make + # git clone -b 3.20.x https://github.com/protocolbuffers/protobuf.git + # cd protobuf/ + # git submodule update --init --recursive + # ./autogen.sh + # ./configure + # cores_available=$(nproc --all) + # n_build_cores=$(( cores_available -1 )) + # if (( n_build_cores < 1 )) ; then n_build_cores=1 ; fi + # make -j $n_build_cores + # sudo make install + # sudo ldconfig + # cd .. 
else echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping installing HIP dependencies" fi diff --git a/.github/workflows/helpers/install_nccl.sh b/.github/workflows/helpers/install_nccl.sh index ae6793ea2..306e5d699 100755 --- a/.github/workflows/helpers/install_nccl.sh +++ b/.github/workflows/helpers/install_nccl.sh @@ -5,47 +5,10 @@ set -x # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" -# Add NCCL key ring ubuntu_version=$(lsb_release -rs) ubuntu_version=${ubuntu_version//./} -wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb" +wget -c -q "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb" sudo dpkg -i cuda-keyring_1.1-1_all.deb -sudo apt update -y +sudo apt-get update -y --allow-change-held-packages rm -f cuda-keyring_1.1-1_all.deb - -# Install NCCL -cuda_version=${1:-12.1.1} -cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.') -echo "Installing NCCL for CUDA version: ${cuda_version} ..." - -# We need to run a different install command based on the CUDA version, otherwise running `sudo apt install libnccl2 libnccl-dev` -# will automatically upgrade CUDA to the latest version. - -if [[ "$cuda_version" == "11.0" ]]; then - sudo apt install libnccl2=2.15.5-1+cuda11.0 libnccl-dev=2.15.5-1+cuda11.0 -elif [[ "$cuda_version" == "11.1" ]]; then - sudo apt install libnccl2=2.8.4-1+cuda11.1 libnccl-dev=2.8.4-1+cuda11.1 -elif [[ "$cuda_version" == "11.2" ]]; then - sudo apt install libnccl2=2.8.4-1+cuda11.2 libnccl-dev=2.8.4-1+cuda11.2 -elif [[ "$cuda_version" == "11.3" ]]; then - sudo apt install libnccl2=2.9.9-1+cuda11.3 libnccl-dev=2.9.9-1+cuda11.3 -elif [[ "$cuda_version" == "11.4" ]]; then - sudo apt install libnccl2=2.11.4-1+cuda11.4 libnccl-dev=2.11.4-1+cuda11.4 -elif [[ "$cuda_version" == "11.5" ]]; then - sudo apt install libnccl2=2.11.4-1+cuda11.5 libnccl-dev=2.11.4-1+cuda11.5 -elif [[ "$cuda_version" == "11.6" ]]; then - sudo apt install libnccl2=2.12.12-1+cuda11.6 libnccl-dev=2.12.12-1+cuda11.6 -elif [[ "$cuda_version" == "11.7" ]]; then - sudo apt install libnccl2=2.14.3-1+cuda11.7 libnccl-dev=2.14.3-1+cuda11.7 -elif [[ "$cuda_version" == "11.8" ]]; then - sudo apt install libnccl2=2.16.5-1+cuda11.8 libnccl-dev=2.16.5-1+cuda11.8 -elif [[ "$cuda_version" == "12.0" ]]; then - sudo apt install libnccl2=2.18.3-1+cuda12.0 libnccl-dev=2.18.3-1+cuda12.0 -elif [[ "$cuda_version" == "12.1" ]]; then - sudo apt install libnccl2=2.18.3-1+cuda12.1 libnccl-dev=2.18.3-1+cuda12.1 -elif [[ "$cuda_version" == "12.2" ]]; then - sudo apt install libnccl2=2.18.3-1+cuda12.2 libnccl-dev=2.18.3-1+cuda12.2 -else - echo "Installing NCCL for CUDA version ${cuda_version} is not supported" - exit 1 -fi +sudo apt install -y --allow-change-held-packages libnccl2 libnccl-dev diff --git a/.github/workflows/pip-deploy.yml b/.github/workflows/pip-deploy.yml index 5558e51e3..37ea63705 100644 --- a/.github/workflows/pip-deploy.yml +++ b/.github/workflows/pip-deploy.yml @@ -9,7 +9,7 @@ concurrency: jobs: build-n-publish: name: Build and publish Python 🐍 distributions 📦 to PyPI and TestPyPI - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 permissions: # IMPORTANT: this permission is mandatory for trusted publishing id-token: write diff --git a/.github/workflows/pip-install.yml b/.github/workflows/pip-install.yml index f348bfe5e..26a74f8db 100644 --- a/.github/workflows/pip-install.yml +++ b/.github/workflows/pip-install.yml @@ -13,7 +13,7 @@ concurrency: jobs: 
pip-install-flexflow: name: Install FlexFlow with pip - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 defaults: run: shell: bash -l {0} # required to use an activated conda environment diff --git a/.github/workflows/shell-check.yml b/.github/workflows/shell-check.yml index a825d63d9..f2e31429c 100644 --- a/.github/workflows/shell-check.yml +++ b/.github/workflows/shell-check.yml @@ -3,7 +3,7 @@ on: [push, pull_request, workflow_dispatch] jobs: shellcheck: name: Shellcheck - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 - name: Run ShellCheck diff --git a/.gitmodules b/.gitmodules index 913b08886..1f49e93f0 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "deps/legion"] path = deps/legion - url = https://gitlab.com/goliaro/legion.git + url = https://github.com/flexflow/legion.git [submodule "deps/nccl"] path = deps/nccl url = https://github.com/NVIDIA/nccl.git diff --git a/conda/flexflow.yml b/conda/flexflow.yml index d457684d0..7568ed648 100644 --- a/conda/flexflow.yml +++ b/conda/flexflow.yml @@ -3,23 +3,19 @@ channels: - defaults - conda-forge dependencies: - - python>=3.6,<3.12 - - cffi>=1.11.0 - - Pillow + - python + - cffi - rust - cmake-build-extension - jq - pytest - pip - pip: - - qualname>=0.1.0 - - keras_preprocessing>=1.1.2 - - numpy>=1.16.0 - - torch>=1.13.1 - - torchaudio>=0.13.1 - - torchvision>=0.14.1 + - numpy + - torch + - torchaudio + - torchvision - regex - - onnx - transformers>=4.47.1 - sentencepiece - einops diff --git a/docker/build.sh b/docker/build.sh index 82e6551e6..8d477a6c5 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -4,13 +4,134 @@ set -euo pipefail # Usage: ./build.sh # Optional environment variables: FF_GPU_BACKEND, cuda_version, hip_version +get_cuda_docker_image() { + local docker_user="nvidia" + local docker_image="cuda" + local page=1 + local per_page=100 + + # Determine Ubuntu version: use lsb_release if available, else default to 22.04. + local ubuntu_version + if command -v lsb_release >/dev/null 2>&1; then + ubuntu_version=$(lsb_release -rs) + else + ubuntu_version="22.04" + fi + + # Determine CUDA version. + # If the environment variable 'cuda_version' is set (in "." format), use that. + # Otherwise, use nvidia-smi to extract the CUDA version. + local cuda_full_version + local installed_major_minor + if [[ -n "${cuda_version:-}" ]]; then + cuda_full_version="$cuda_version" + installed_major_minor="$cuda_version" + else + if ! command -v nvidia-smi >/dev/null 2>&1; then + echo "Error: nvidia-smi not found and cuda_version is not set." >&2 + return 1 + fi + local nvidia_smi_output + nvidia_smi_output=$(nvidia-smi) + local cuda_version_line + cuda_version_line=$(echo "$nvidia_smi_output" | grep "CUDA Version") + cuda_full_version=$(echo "$cuda_version_line" | sed -n 's/.*CUDA Version:\s*\([0-9]\+\.[0-9]\+\).*/\1/p') + if [[ -z "$cuda_full_version" ]]; then + echo "Error: Unable to determine CUDA version from nvidia-smi." >&2 + return 1 + fi + installed_major_minor="$cuda_full_version" + fi + + # Query Docker Hub for matching tags. 
+ local -a tags_list=() + while true; do + local response new_tags + response=$(curl -s "https://hub.docker.com/v2/repositories/${docker_user}/${docker_image}/tags?page=${page}&page_size=${per_page}") + new_tags=$(echo "$response" | jq -r --arg v "$ubuntu_version" '.results[].name | select(contains("cudnn") and contains("devel-ubuntu") and test("ubuntu"+$v+"$"))') + if [[ -z "$new_tags" ]]; then + break + fi + while read -r tag; do + tags_list+=("$tag") + done <<< "$new_tags" + ((page++)) + done + + if [ ${#tags_list[@]} -eq 0 ]; then + echo "Error: No docker images found matching criteria." >&2 + return 1 + fi + + # Sort the tags in descending order based on the CUDA version. + local sorted_tags + sorted_tags=$(printf "%s\n" "${tags_list[@]}" | sort -rV -t '-' -k1,1) + + # Find the most appropriate tag. + local selected_tag="" + while read -r tag; do + local version tag_major_minor + version=$(echo "$tag" | cut -d '-' -f1) + tag_major_minor=$(echo "$version" | awk -F. '{print $1"."$2}') + if [[ "$tag_major_minor" == "$installed_major_minor" ]]; then + selected_tag="$tag" + break + fi + done <<< "$sorted_tags" + + # If no exact match, choose the highest version lower than the installed version. + if [[ -z "$selected_tag" ]]; then + while read -r tag; do + local version + version=$(echo "$tag" | cut -d '-' -f1) + if [[ $(printf '%s\n' "$version" "$cuda_full_version" | sort -V | head -n1) == "$version" && "$version" != "$cuda_full_version" ]]; then + selected_tag="$tag" + break + fi + done <<< "$sorted_tags" + fi + + if [[ -n "$selected_tag" ]]; then + echo "${docker_user}/${docker_image}:${selected_tag}" + return 0 + else + echo "Error: No suitable docker image found." >&2 + return 1 + fi +} + +set_cuda_version_version() { + # If the user provided a cuda_version, use that. + if [[ -n "${cuda_version:-}" ]]; then + return 0 + fi + + # Otherwise, check that nvidia-smi is available. + if ! command -v nvidia-smi >/dev/null 2>&1; then + echo "Error: nvidia-smi not found and cuda_version is not set." >&2 + return 1 + fi + + # Extract the CUDA version from nvidia-smi output. + local nvidia_output cuda_ver + nvidia_output=$(nvidia-smi) + cuda_ver=$(echo "$nvidia_output" | grep "CUDA Version" | sed -n 's/.*CUDA Version:\s*\([0-9]\+\.[0-9]\+\).*/\1/p') + + if [[ -z "$cuda_ver" ]]; then + echo "Error: Unable to detect CUDA version from nvidia-smi." >&2 + return 1 + fi + + export cuda_version="$cuda_ver" + return 0 +} + # Cd into flexflow-serve. Assumes this script is in flexflow-serve/docker cd "${BASH_SOURCE[0]%/*}/.." # Parse input params image=${1:-flexflow} FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} -cuda_version=${cuda_version:-"empty"} hip_version=${hip_version:-"empty"} python_version=${python_version:-latest} @@ -36,36 +157,10 @@ ff_environment_base_image="ubuntu:20.04" gpu_backend_version="" if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then - # Autodetect cuda version if not specified - if [[ $cuda_version == "empty" ]]; then - # shellcheck disable=SC2015 - cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}' || true) - # Change cuda_version eg. V11.7.99 to 11.7 - cuda_version=${cuda_version:1:4} - if [[ -z "$cuda_version" ]]; then - echo "Could not detect CUDA version. Please specify one manually by setting the 'cuda_version' env." 
- exit 1 - fi - fi - # Check that CUDA version is supported, and modify cuda version to include default subsubversion - if [[ "$cuda_version" == @(11.1|11.3|11.7|12.0|12.1) ]]; then - cuda_version_input=${cuda_version}.1 - elif [[ "$cuda_version" == @(11.2|11.5|11.6|12.2) ]]; then - cuda_version_input=${cuda_version}.2 - elif [[ "$cuda_version" == @(11.4) ]]; then - cuda_version_input=${cuda_version}.3 - elif [[ "$cuda_version" == @(11.8) ]]; then - cuda_version_input=${cuda_version}.0 - elif [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then - # Use CUDA 12.2 for all versions greater or equal to 12.2 for now (the Docker machine with CUDNN is not yet available) - cuda_version=12.2 - cuda_version_input=${cuda_version}.2 - else - echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" - exit 1 - fi - echo "Building $image docker image with CUDA $cuda_version" - ff_environment_base_image="nvidia/cuda:${cuda_version_input}-cudnn8-devel-ubuntu20.04" + ff_environment_base_image=$(get_cuda_docker_image) || { echo "Failed to get docker image." >&2; exit 1; } + echo "Using base docker image: $ff_environment_base_image" + set_cuda_version_version || { echo "Failed to set gpu_backend_version." >&2; exit 1; } + echo "GPU Backend Version is set to: $cuda_version" gpu_backend_version="-${cuda_version}" fi @@ -90,6 +185,7 @@ if [[ "${FF_GPU_BACKEND}" == "hip_rocm" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; if [[ "${FF_GPU_BACKEND}" == "hip_rocm" ]]; then gpu_backend_version="-${hip_version}" fi + cuda_version="empty" fi # Get number of cores available on the machine. Build with all cores but one, to prevent RAM choking @@ -97,7 +193,7 @@ cores_available=$(nproc --all) n_build_cores=$(( cores_available -1 )) # check python_version -if [[ "$python_version" != @(3.8|3.9|3.10|3.11|latest) ]]; then +if [[ "$python_version" != @(3.8|3.9|3.10|3.11|3.12|latest) ]]; then echo "python_version not supported!" 
exit 0 fi diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 25c45646f..92423adf2 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -19,10 +19,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binut # Install Python3 with Miniconda ARG python_version "latest" -#RUN MINICONDA_SCRIPT_NAME=Miniconda3-latest-Linux-x86_64.sh; \ -RUN MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \ - if [ "$python_version" != "3.8" ] && [ "$python_version" != "3.9" ] && [ "$python_version" != "3.10" ] && [ "$python_version" != "3.11" ] && [ "$python_version" != "latest" ]; then \ - echo "python_version '${python_version}' is not supported, please choose among {3.8, 3.9, 3.10, 3.11 or latest (default)}"; \ +RUN MINICONDA_SCRIPT_NAME=Miniconda3-latest-Linux-x86_64.sh; \ + if [ "$python_version" != "3.8" ] && [ "$python_version" != "3.9" ] && [ "$python_version" != "3.10" ] && [ "$python_version" != "3.11" ] && [ "$python_version" != "3.12" ] && [ "$python_version" != "latest" ]; then \ + echo "python_version '${python_version}' is not supported, please choose among {3.8, 3.9, 3.10, 3.11, 3.12 or latest (default)}"; \ exit 1; \ fi; \ if [ "${python_version}" = "3.8" ]; then \ @@ -33,6 +32,10 @@ RUN MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \ MINICONDA_SCRIPT_NAME=Miniconda3-py310_23.5.2-0-Linux-x86_64.sh; \ elif [ "${python_version}" = "3.11" ]; then \ MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \ + elif [ "${python_version}" = "3.12" ]; then \ + MINICONDA_SCRIPT_NAME=Miniconda3-py312_25.1.1-2-Linux-x86_64.sh; \ + elif [ "${python_version}" = "latest" ]; then \ + MINICONDA_SCRIPT_NAME=Miniconda3-latest-Linux-x86_64.sh; \ fi; \ wget -c -q https://repo.continuum.io/miniconda/${MINICONDA_SCRIPT_NAME} && \ mv ./${MINICONDA_SCRIPT_NAME} ~/${MINICONDA_SCRIPT_NAME} && \ @@ -42,7 +45,8 @@ RUN MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \ /opt/conda/bin/conda config --set solver classic && \ /opt/conda/bin/conda upgrade --all && \ /opt/conda/bin/conda install conda-build conda-verify && \ - /opt/conda/bin/conda clean -ya + /opt/conda/bin/conda clean -ya && \ + ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/libstdc++.so.6 # set MAKEFLAGS to speedup any dependency that uses make ARG N_BUILD_CORES @@ -58,20 +62,6 @@ ARG FF_GPU_BACKEND "cuda" ARG cuda_version "" ARG hip_version "5.6" -# Update NCCL if FF_GPU_BACKEND is cuda -# RUN /bin/bash -c 'if [ "$FF_GPU_BACKEND" = "cuda" ]; then \ -# echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Updating NCCL"; \ -# ubuntu_version=$(lsb_release -rs); \ -# ubuntu_version=${ubuntu_version//./}; \ -# wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb"; \ -# DEBIAN_FRONTEND=noninteractive dpkg -i cuda-keyring_1.0-1_all.deb; \ -# DEBIAN_FRONTEND=noninteractive apt-get update -y --allow-change-held-packages; \ -# rm -f cuda-keyring_1.0-1_all.deb; \ -# DEBIAN_FRONTEND=noninteractive apt install -y --allow-change-held-packages libnccl2 libnccl-dev; \ -# else \ -# echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping updating NCCL"; \ -# fi' - # Install hip dependencies if FF_GPU_BACKEND is hip_cuda or hip_rocm # Note that amd's docs say to also install the `hip-runtime-nvidia` package. 
This # package attempts to re-install cuda even though cuda is already installed @@ -93,22 +83,21 @@ RUN if [ "$FF_GPU_BACKEND" = "hip_cuda" ] || [ "$FF_GPU_BACKEND" = "hip_rocm" ] elif [ "$hip_version" = "5.5" ]; then \ AMD_GPU_SCRIPT_NAME=amdgpu-install_5.5.50500-1_all.deb; \ fi; \ - AMD_GPU_SCRIPT_URL="https://repo.radeon.com/amdgpu-install/${hip_version}/ubuntu/focal/${AMD_GPU_SCRIPT_NAME}"; \ + ubuntu_codename=$(lsb_release -cs); \ + AMD_GPU_SCRIPT_URL="https://repo.radeon.com/amdgpu-install/${hip_version}/ubuntu/${ubuntu_codename}/${AMD_GPU_SCRIPT_NAME}"; \ # Download and install AMD GPU software with ROCM and HIP support wget $AMD_GPU_SCRIPT_URL; \ apt-get install -y ./${AMD_GPU_SCRIPT_NAME}; \ rm ./${AMD_GPU_SCRIPT_NAME}; \ amdgpu-install -y --usecase=hip,rocm --no-dkms; \ apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk rocm-device-libs; \ - # Install protobuf dependencies - apt-get update -y && sudo apt-get install -y pkg-config zip g++ zlib1g-dev autoconf automake libtool make; \ else \ echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping installing HIP dependencies"; \ fi RUN rm -rf /var/lib/apt/lists/* # Install python packages and other dependencies -RUN conda install -c conda-forge cmake make pillow cmake-build-extension numpy pandas keras-preprocessing +RUN conda install -c conda-forge cmake make cmake-build-extension numpy pandas # Install Pytorch COPY docker/flexflow-environment/install_pytorch.sh /usr/local/bin/install_pytorch.sh RUN if [ "$FF_GPU_BACKEND" == "cuda" ] ; then \ diff --git a/docker/publish.sh b/docker/publish.sh index c70419a9c..bfbbbc6bf 100755 --- a/docker/publish.sh +++ b/docker/publish.sh @@ -4,13 +4,39 @@ set -euo pipefail # Usage: ./publish.sh # Optional environment variables: FF_GPU_BACKEND, cuda_version, hip_version +set_cuda_version_version() { + # If the user provided a cuda_version, use that. + if [[ -n "${cuda_version:-}" ]]; then + return 0 + fi + + # Otherwise, check that nvidia-smi is available. + if ! command -v nvidia-smi >/dev/null 2>&1; then + echo "Error: nvidia-smi not found and cuda_version is not set." >&2 + return 1 + fi + + # Extract the CUDA version from nvidia-smi output. + local nvidia_output cuda_ver + nvidia_output=$(nvidia-smi) + cuda_ver=$(echo "$nvidia_output" | grep "CUDA Version" | sed -n 's/.*CUDA Version:\s*\([0-9]\+\.[0-9]\+\).*/\1/p') + + if [[ -z "$cuda_ver" ]]; then + echo "Error: Unable to detect CUDA version from nvidia-smi." >&2 + return 1 + fi + + export cuda_version="$cuda_ver" + return 0 +} + + # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" # Parse input params image=${1:-flexflow} FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} -cuda_version=${cuda_version:-"empty"} hip_version=${hip_version:-"empty"} # Check docker image name @@ -33,20 +59,11 @@ else fi if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then - # Autodetect cuda version if not specified - if [[ $cuda_version == "empty" ]]; then - # shellcheck disable=SC2015 - cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}' || true) - # Change cuda_version eg. V11.7.99 to 11.7 - cuda_version=${cuda_version:1:4} - if [[ -z "$cuda_version" ]]; then - echo "Could not detect CUDA version. Please specify one manually by setting the 'cuda_version' env." - exit 1 - fi - fi + set_cuda_version_version || { echo "Failed to set gpu_backend_version." 
>&2; exit 1; } + # Check that CUDA version is supported - if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then - echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" + if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8) ]]; then + echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8}" exit 1 fi # Set cuda version suffix to docker image name diff --git a/docker/pull.sh b/docker/pull.sh index f641e1a59..0ac456fa9 100755 --- a/docker/pull.sh +++ b/docker/pull.sh @@ -4,13 +4,39 @@ set -euo pipefail # Usage: ./pull.sh # Optional environment variables: FF_GPU_BACKEND, cuda_version, hip_version +set_cuda_version_version() { + # If the user provided a cuda_version, use that. + if [[ -n "${cuda_version:-}" ]]; then + return 0 + fi + + # Otherwise, check that nvidia-smi is available. + if ! command -v nvidia-smi >/dev/null 2>&1; then + echo "Error: nvidia-smi not found and cuda_version is not set." >&2 + return 1 + fi + + # Extract the CUDA version from nvidia-smi output. + local nvidia_output cuda_ver + nvidia_output=$(nvidia-smi) + cuda_ver=$(echo "$nvidia_output" | grep "CUDA Version" | sed -n 's/.*CUDA Version:\s*\([0-9]\+\.[0-9]\+\).*/\1/p') + + if [[ -z "$cuda_ver" ]]; then + echo "Error: Unable to detect CUDA version from nvidia-smi." >&2 + return 1 + fi + + export cuda_version="$cuda_ver" + return 0 +} + + # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" # Parse input params image=${1:-flexflow} FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} -cuda_version=${cuda_version:-"empty"} hip_version=${hip_version:-"empty"} # Check docker image name @@ -33,26 +59,12 @@ fi gpu_backend_version="" if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then - # Autodetect cuda version if not specified - if [[ $cuda_version == "empty" ]]; then - # shellcheck disable=SC2015 - cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}' || true) - # Change cuda_version eg. V11.7.99 to 11.7 - cuda_version=${cuda_version:1:4} - if [[ -z "$cuda_version" ]]; then - echo "Could not detect CUDA version. Please specify one manually by setting the 'cuda_version' env." - exit 1 - fi - fi + set_cuda_version_version || { echo "Failed to set gpu_backend_version." 
>&2; exit 1; } # Check that CUDA version is supported - if [[ "$cuda_version" != @(11.1|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then - echo "cuda_version is not available for download, please choose among {11.1|11.6|11.7|11.8|12.0|12.1|12.2}" + if [[ "$cuda_version" != @(11.1|11.6|11.7|11.8|12.0|12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8) ]]; then + echo "cuda_version is not available for download, please choose among {11.1|11.6|11.7|11.8|12.0|12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8}" exit 1 fi - # Use CUDA 12.2 for all versions greater or equal to 12.2 for now - if [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then - cuda_version=12.2 - fi # Set cuda version suffix to docker image name echo "Downloading $image docker image with CUDA $cuda_version" gpu_backend_version="-${cuda_version}" diff --git a/docker/run.sh b/docker/run.sh index d6c8e4fad..72bc708db 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -4,13 +4,38 @@ set -euo pipefail # Usage: ./run.sh # Optional environment variables: FF_GPU_BACKEND, cuda_version, hip_version, ATTACH_GPUS, SHM_SIZE +set_cuda_version_version() { + # If the user provided a cuda_version, use that. + if [[ -n "${cuda_version:-}" ]]; then + return 0 + fi + + # Otherwise, check that nvidia-smi is available. + if ! command -v nvidia-smi >/dev/null 2>&1; then + echo "Error: nvidia-smi not found and cuda_version is not set." >&2 + return 1 + fi + + # Extract the CUDA version from nvidia-smi output. + local nvidia_output cuda_ver + nvidia_output=$(nvidia-smi) + cuda_ver=$(echo "$nvidia_output" | grep "CUDA Version" | sed -n 's/.*CUDA Version:\s*\([0-9]\+\.[0-9]\+\).*/\1/p') + + if [[ -z "$cuda_ver" ]]; then + echo "Error: Unable to detect CUDA version from nvidia-smi." >&2 + return 1 + fi + + export cuda_version="$cuda_ver" + return 0 +} + # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" # Parse input params image=${1:-flexflow} FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} -cuda_version=${cuda_version:-"empty"} hip_version=${hip_version:-"empty"} # Parameter controlling whether to attach GPUs to the Docker container @@ -49,28 +74,8 @@ fi gpu_backend_version="" if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then - # Autodetect cuda version if not specified - if [[ $cuda_version == "empty" ]]; then - # shellcheck disable=SC2015 - cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}' || true) - # Change cuda_version eg. V11.7.99 to 11.7 - cuda_version=${cuda_version:1:4} - if [[ -z "$cuda_version" ]]; then - echo "Could not detect CUDA version. Please specify one manually by setting the 'cuda_version' env." - exit 1 - fi - fi - # Check that CUDA version is supported - if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then - echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" - exit 1 - fi - # Use CUDA 12.2 for all versions greater or equal to 12.2 for now - if [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then - cuda_version=12.2 - fi - # Set cuda version suffix to docker image name - echo "Running $image docker image with CUDA $cuda_version" + set_cuda_version_version || { echo "Failed to set gpu_backend_version." 
>&2; exit 1; } + echo "Running $image docker image with CUDA: $cuda_version" gpu_backend_version="-${cuda_version}" fi diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 5cbe395c1..2e9602e15 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -64,14 +64,12 @@ class BatchConfig { BatchConfig(); int num_active_requests() const; int num_active_tokens() const; - int finetuning_request_index() const; int num_finetuning_fwd_requests() const; int num_finetuning_fwd_tokens() const; int num_finetuning_bwd_requests() const; int num_finetuning_bwd_tokens() const; bool peft_bwd_applies_to_this_layer(int layer) const; - static int max_requests_per_batch(); static int max_tokens_per_batch(); static int max_verify_tokens_per_batch(); @@ -91,8 +89,6 @@ class BatchConfig { static int const MAX_SPEC_TREE_TOKEN_NUM = 64; static int const MAX_PEFT_CONFIG_SIZE = 1024; - // number of tokens in prompt phase, start offset of tokens in inc_decoding - // phase. num_tokens - num_prompt_tokens = num_generation_tokens; int num_tokens = 0, num_generation_tokens = 0; struct PerRequestInfo { diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 2cba1cbb9..1deb7ad83 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -95,6 +95,10 @@ bool flexflow_config_get_enable_peft(flexflow_config_t handle_); bool flexflow_config_get_enable_peft_finetuning(flexflow_config_t handle_); void flexflow_config_set_enable_peft_finetuning(flexflow_config_t handle_, bool value); +bool flexflow_config_get_enable_peft_finetuning(flexflow_config_t handle_); +void flexflow_config_set_enable_peft_finetuning(flexflow_config_t handle_, + bool value); + void flexflow_config_set_data_parallelism_degree(flexflow_config_t handle_, int value); diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index e0f4dd0ba..7a0187712 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -250,6 +250,8 @@ class Op { std::vector const &, MachineView const *mv = nullptr) { assert(false); + Legion::FutureMap empty_map; + return empty_map; } virtual void print_layer(FFModel const &model) = 0; template diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 594c43053..d7d4c10d3 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -185,6 +185,8 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { void *qk_prods, *qk_prods_softmax; void *attn_heads; BatchConfig::PerTokenInfo *token_infos; + BatchConfig::PerTokenInfo *peft_token_infos; + BatchConfig::PerTokenInfo *peft_token_infos_device; BatchConfig::PerRequestInfo *request_infos; DataType quantization_type; bool offload; @@ -204,7 +206,8 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { // PEFT specific fields void *softmax_activation_buffer; void *query_activation_buffer; - size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; + size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0, + peft_token_infos_size = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/linear_kernels.h b/include/flexflow/ops/kernels/linear_kernels.h index 0284970d5..b7deeee6c 100644 --- a/include/flexflow/ops/kernels/linear_kernels.h +++ b/include/flexflow/ops/kernels/linear_kernels.h @@ -96,7 +96,7 @@ void store_peft_activations(LinearMeta const *m, BatchConfig const *bc, size_t 
out_dim, DT *output_ptr, - cudaStream_t stream); + ffStream_t stream); template void peft_bwd_kernel(LinearMeta const *m, BatchConfig const *bc, diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index 5d34b73fb..7279edb80 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -81,6 +81,12 @@ void peft_bwd_kernel(SoftmaxMeta const *m, DT *input_grad_ptr, int num_classes, ffStream_t stream); +template +void store_peft_activations(SoftmaxMeta *m, + BatchConfig const *bc, + int num_classes, + DT *output_ptr, + ffStream_t stream); template void store_peft_activations(SoftmaxMeta *m, diff --git a/include/flexflow/parallel_ops/allreduce.h b/include/flexflow/parallel_ops/allreduce.h index 1d3f5d421..7cb09e7dc 100644 --- a/include/flexflow/parallel_ops/allreduce.h +++ b/include/flexflow/parallel_ops/allreduce.h @@ -16,6 +16,7 @@ class AllReduce : public ParallelOp { using Input = ParallelTensor; AllReduce(FFModel &model, + LayerID const &_layer_guid, const ParallelTensor input, int allreduce_legion_dim, char const *name = NULL); @@ -75,6 +76,7 @@ class AllReduce : public ParallelOp { Params get_params() const; public: + LayerID layer_guid; int allreduce_dim; }; diff --git a/include/flexflow/parallel_ops/allreduce_params.h b/include/flexflow/parallel_ops/allreduce_params.h index a0daac8f9..32713778b 100644 --- a/include/flexflow/parallel_ops/allreduce_params.h +++ b/include/flexflow/parallel_ops/allreduce_params.h @@ -4,6 +4,7 @@ namespace FlexFlow { struct AllReduceParams { + LayerID layer_guid; int allreduce_legion_dim; char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; diff --git a/include/flexflow/parallel_ops/parallel_identity.h b/include/flexflow/parallel_ops/parallel_identity.h index 424cc9b28..f5ef39055 100644 --- a/include/flexflow/parallel_ops/parallel_identity.h +++ b/include/flexflow/parallel_ops/parallel_identity.h @@ -16,6 +16,7 @@ class ParallelIdentity : public ParallelOp { using Input = ParallelTensor; ParallelIdentity(FFModel &model, + LayerID const &_layer_guid, const ParallelTensor input, int parallel_identity_legion_dim, char const *name = NULL); @@ -75,6 +76,7 @@ class ParallelIdentity : public ParallelOp { Params get_params() const; public: + LayerID layer_guid; int parallel_identity_dim; }; diff --git a/include/flexflow/parallel_ops/parallel_identity_params.h b/include/flexflow/parallel_ops/parallel_identity_params.h index 6eeed662e..d11ddb5f2 100644 --- a/include/flexflow/parallel_ops/parallel_identity_params.h +++ b/include/flexflow/parallel_ops/parallel_identity_params.h @@ -4,6 +4,7 @@ namespace FlexFlow { struct ParallelIdentityParams { + LayerID layer_guid; int parallel_identity_legion_dim; char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 8b07a12d5..8b9bd7552 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -82,7 +82,6 @@ struct InferenceReqProfileInfo { int decoding_step_idx; long long timestamp; }; - struct Request { enum Status { PENDING = 101, // loading prompt @@ -133,10 +132,8 @@ struct Request { int ssm_cache_size = 0; int llm_cache_size = 0; std::vector beam_trees; - Request() = default; static Request from_other(Request const &other); - friend std::ostream &operator<<(std::ostream &os, Request const &req); }; @@ -173,7 +170,6 @@ class RequestManager { size_t 
get_num_ssms(); bool load_request_token_ids(Request &request); - void set_verbose(bool verbose); void set_max_requests_per_batch(int max_num_requests); int get_max_requests_per_batch(); @@ -186,8 +182,6 @@ class RequestManager { int get_max_verify_tokens_per_batch(); int get_max_sequence_length(); void set_max_sequence_length(int max_seq_length); - int get_max_finetuning_sequence_length(); - void set_max_finetuning_sequence_length(int max_seq_length); void push_spec_infer_tree_width(int tree_width); void set_enable_peft_finetuning(bool enable_peft_finetuning_); void set_inference_finished(bool finished = true); @@ -244,7 +238,6 @@ class RequestManager { void add_peft_config_to_request_info(BatchConfig &bc, int req_idx, LoraLinearConfig const &peft_config); - // helpers for prepare_next_batch void process_inf_req_progress(BatchConfig const &old_fwd_bc, InferenceResult const &result); diff --git a/inference/peft/CMakeLists.txt b/inference/peft/CMakeLists.txt index 623e78c52..1fff8cbf1 100644 --- a/inference/peft/CMakeLists.txt +++ b/inference/peft/CMakeLists.txt @@ -124,4 +124,4 @@ target_include_directories(${project_target5} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} $ target_include_directories(${project_target5} PRIVATE ${CMAKE_SOURCE_DIR}/inference) target_link_libraries(${project_target5} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) set(BIN_DEST "bin") -install(TARGETS ${project_target5} DESTINATION ${BIN_DEST}) \ No newline at end of file +install(TARGETS ${project_target5} DESTINATION ${BIN_DEST}) diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index 09dcc68ee..de922eb51 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -34,7 +34,8 @@ struct FilePaths { std::string cache_folder_path; std::string prompt_file_path; std::string dataset_file_path; - std::string output_folder_path; + std::string output_file_path; + std::string profiling_folder_path; }; void parse_input_args(char **argv, @@ -52,7 +53,8 @@ void parse_input_args(char **argv, int &max_tokens_per_batch, int &max_sequence_length, int &max_training_steps, - int &num_layers_per_finetuning_step) { + int &num_layers_per_finetuning_step, + bool &run_warmup) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -93,10 +95,18 @@ void parse_input_args(char **argv, paths.output_folder_path = std::string(argv[++i]); continue; } + if (!strcmp(argv[i], "-profiling-folder")) { + paths.profiling_folder_path = std::string(argv[++i]); + continue; + } if (!strcmp(argv[i], "--use-full-precision")) { use_full_precision = true; continue; } + if (!strcmp(argv[i], "--warmup")) { + run_warmup = true; + continue; + } // verbose logging to stdout if (!strcmp(argv[i], "--verbose")) { verbose = true; @@ -147,44 +157,6 @@ void parse_input_args(char **argv, wordfree(&p); } -std::vector parse_trace_file(std::string const &trace_file_path) { - using json = nlohmann::json; - std::ifstream file_handle(trace_file_path); - assert(file_handle.good() && "Trace file does not exist."); - nlohmann::ordered_json prompt_json = - nlohmann::ordered_json::parse(file_handle, - /*parser_callback_t */ nullptr, - /*allow_exceptions */ true, - /*ignore_comments */ true); - file_handle.close(); - auto &metadata = prompt_json["metadata"]; - int num_warmup_requests = metadata["num_warmup_requests"]; - int num_regular_requests = 0, total_requests = 0; - std::vector warmup_requests, requests; - for (auto &entry : prompt_json["entries"]) { - int prompt_length = entry["prompt_length"]; - 
int response_length = entry["response_length"]; - std::string text = entry["prompt"]; - bool is_warmup_request = total_requests < num_warmup_requests; - - Request inference_req; - inference_req.prompt = text; - inference_req.add_special_tokens = false; - inference_req.max_new_tokens = response_length; - - if (is_warmup_request) { - warmup_requests.push_back(inference_req); - } else { - // printf("Prompt[%d]: %s\n", total_requests, text.c_str()); - requests.push_back(inference_req); - num_regular_requests++; - } - - total_requests++; - } - return requests; -} - std::vector make_warmup_requests(int num_inf_request, int num_finetuning_steps, PEFTModelID *peft_model_id) { @@ -228,9 +200,10 @@ void FlexFlow::top_level_task(Task const *task, int max_requests_per_batch = 1; int max_tokens_per_batch = 128; int max_sequence_length = 256; - int max_training_steps = -1; + int max_training_steps = 2; bool enable_peft_finetuning = true; int num_layers_per_finetuning_step = -1; + bool run_warmup = false; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -250,7 +223,8 @@ void FlexFlow::top_level_task(Task const *task, max_tokens_per_batch, max_sequence_length, max_training_steps, - num_layers_per_finetuning_step); + num_layers_per_finetuning_step, + run_warmup); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); @@ -417,16 +391,23 @@ void FlexFlow::top_level_task(Task const *task, rm->start_background_server(&model); // Add PEFT adapter(s) - // PEFTModelID *peft_model_id = nullptr, *peft_model_id_finetuning = nullptr; - // if (!peft_model_name.empty()) { - // peft_model_id = model.register_peft_adapter(peft_config); - // if (enable_peft_finetuning) { - // peft_model_id_finetuning = - // model.register_peft_adapter(peft_config_finetuning); - // } - // } - PEFTModelID *peft_model_id_finetuning = - model.register_peft_adapter(peft_config_finetuning); + PEFTModelID *peft_model_id = nullptr, *peft_model_id_finetuning = nullptr; + if (!peft_model_name.empty() && !enable_peft_finetuning) { + peft_model_id = model.register_peft_adapter(peft_config); + } + if (enable_peft_finetuning) { + peft_model_id_finetuning = + model.register_peft_adapter(peft_config_finetuning); + } + + if (run_warmup) { + std::vector warmup_requests = + make_warmup_requests(10, 1000, peft_model_id_finetuning); + std::vector warmup_result = + model.generate(warmup_requests); + rm->set_inference_finished(false); // reset inference finished flag + std::cout << "----------warmup finished--------------" << std::endl; + } // Run workload { @@ -452,6 +433,23 @@ void FlexFlow::top_level_task(Task const *task, finetuning_req.peft_finetuning_info.max_training_steps = max_training_steps; requests.push_back(finetuning_req); + // Add fine-tuning request + if (enable_peft_finetuning) { + assert(!file_paths.dataset_file_path.empty() && + "Dataset file path is required for fine-tuning."); + printf("Finetuning request with dataset %s\n", + file_paths.dataset_file_path.c_str()); + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.peft_model_id = (peft_model_id_finetuning != nullptr) + ? 
*peft_model_id_finetuning + : PEFTModelID::NO_ID; + fine_tuning_req.peft_finetuning_info.dataset_filepath = + file_paths.dataset_file_path; + fine_tuning_req.peft_finetuning_info.max_training_steps = + max_training_steps; + requests.push_back(fine_tuning_req); + } std::vector result = model.generate(requests); } @@ -474,6 +472,32 @@ void FlexFlow::top_level_task(Task const *task, 0.0, // arrival rate 10); // num_warmup_requests + if (!file_paths.profiling_folder_path.empty()) { + std::cout << "Saving profiling info..." << std::endl; + std::string dataset_name; + // set dataset name to "wildchat" if the prompt file path contains + // "wildchat" + if (file_paths.prompt_file_path.find("wildchat") != std::string::npos) { + dataset_name = "wildchat"; + } else if (file_paths.prompt_file_path.find("sharegpt") != + std::string::npos) { + dataset_name = "sharegpt"; + } else { + dataset_name = "unknown"; + } + rm->save_profiling_info_to_csv(file_paths.profiling_folder_path, + dataset_name, + llm_model_name, + model.config.tensor_parallelism_degree, + max_requests_per_batch, + max_tokens_per_batch, + 0.0, // arrival rate + 10); // num_warmup_requests + } + + if (peft_model_id != nullptr) { + free(peft_model_id); + } if (peft_model_id_finetuning != nullptr) { free(peft_model_id_finetuning); } diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 8ca2c97e4..d88a2852b 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -825,7 +825,7 @@ def enable_peft_finetuning(self, value): if type(value) is not bool: raise ValueError("enable_peft_finetuning must be specified as a boolean value") ffc().flexflow_config_set_enable_peft_finetuning(self.handle, value) - + @property def cpu_offload(self): return ffc().flexflow_config_get_offload(self.handle) @@ -1643,6 +1643,15 @@ def set_max_sequence_length(self, max_length): def get_max_sequence_length(self): return ffc().flexflow_request_manager_get_max_sequence_length(self.handle) + + def set_num_transformers_layers(self, num_layers): + return ffc().flexflow_request_manager_set_num_transformers_layers( + self.handle, num_layers + ) + def set_num_layers_per_finetuning_step(self, num_layers): + return ffc().flexflow_request_manager_set_num_layers_per_finetuning_step( + self.handle, num_layers + ) def set_max_finetuning_sequence_length(self, max_length): return ffc().flexflow_request_manager_set_max_finetuning_sequence_length( diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index 24c5f210b..6c547d295 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -90,7 +90,7 @@ def init( - benchmarking: whether to run benchmaking only, without loading real weights, defaults to False - inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False - fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True - - log_instance_creation: whether to log the creation of FlexFlow instances, defaults to False + - log_instance_cration: whether to log the creation of the FlexFlow instances, defaults to False The configurations are passed down to the FlexFlow runtime (implemented in C++) via command line arguments. 
@@ -133,7 +133,7 @@ def init( :type inference_debugging: Optional[bool], optional :param fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True :type fusion: Optional[bool], optional - :param log_instance_cration: whether to log the creation of FlexFlow instances, defaults to False + :param log_instance_cration: whether to log the creation of the FlexFlow instances, defaults to False :type log_instance_cration: Optional[bool], optional :raises ValueError: this function will raise an exception if the user passes both a configs_dict and some named parameters diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 68b226796..394869426 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -259,17 +259,15 @@ def __get_resource_path( def __is_empty_dir(self, folder: str) -> bool: """Check whether a folder only contains the rev_sha.txt file - Args: folder (str): Path to the folder to check - Returns: bool: True if the folder is empty, False otherwise """ if not os.path.isdir(folder) or not os.path.exists(folder): return True return len(os.listdir(folder)) == 1 and "rev_sha.txt" in os.listdir(folder) - + def __need_cache_refresh( self, model_name: str, resource_type: CachedResourceType ) -> bool: @@ -285,7 +283,6 @@ def __need_cache_refresh( """ resource_path = self.__get_resource_path(model_name, resource_type) ff_revision, latest_revision = self.__get_revision_hashes(self.model_name, resource_path) - if self.refresh_cache or not os.path.exists(resource_path) or self.__is_empty_dir(resource_path) or ff_revision != latest_revision: print( f"Refreshing {resource_type} in cache for model {model_name} at path {resource_path} ..." @@ -444,7 +441,6 @@ def compile( max_tokens_per_batch: int = 64, max_concurrent_adapters: int = 1, enable_peft_finetuning: bool = False, - max_finetuning_seq_length: int = -1, num_bwd_layers_per_ft_step: int = -1, ssms: list = [], ): @@ -462,8 +458,6 @@ def compile( :type max_concurrent_adapters: int, optional :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False :type enable_peft_finetuning: bool, optional - :param max_finetuning_seq_length: The maximum sequence length to allow for finetuning, defaults to -1 (i.e. same as max_seq_length) - :type max_finetuning_seq_length: int, optional :param num_bwd_layers_per_ft_step: The number of backward layers to run per finetuning step, defaults to -1 (i.e. 
all layers) :type num_bwd_layers_per_ft_step: int, optional :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] @@ -494,10 +488,6 @@ def compile( self.rm.set_max_sequence_length(max_seq_length) self.rm.set_max_concurrent_adapters(max_concurrent_adapters) self.rm.set_enable_peft_finetuning(enable_peft_finetuning) - if max_finetuning_seq_length == -1: - self.rm.set_max_finetuning_sequence_length(max_seq_length) - else: - self.rm.set_max_finetuning_sequence_length(max_finetuning_seq_length) self.rm.set_num_transformers_layers(self.hf_config.num_hidden_layers) if num_bwd_layers_per_ft_step != -1: self.rm.set_num_layers_per_finetuning_step(num_bwd_layers_per_ft_step) @@ -771,7 +761,6 @@ def compile( max_tokens_per_batch: int = 2048, max_concurrent_adapters: int = 1, enable_peft_finetuning: bool = False, - max_finetuning_seq_length: int = -1, num_bwd_layers_per_ft_step: int = -1, ssms: list = [], ): @@ -788,8 +777,6 @@ def compile( :type max_concurrent_adapters: int, optional :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False :type enable_peft_finetuning: bool, optional - :param max_finetuning_seq_length: The maximum sequence length to allow for finetuning, defaults to -1 (i.e. same as max_seq_length) - :type max_finetuning_seq_length: int, optional :param num_bwd_layers_per_ft_step: The number of backward layers to run per finetuning step, defaults to -1 (i.e. all layers) :type num_bwd_layers_per_ft_step: int, optional :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] @@ -802,7 +789,6 @@ def compile( max_tokens_per_batch, max_concurrent_adapters, enable_peft_finetuning, - max_finetuning_seq_length, num_bwd_layers_per_ft_step, ssms, ) diff --git a/requirements.txt b/requirements.txt index 34d0a39bf..87e9b9695 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,12 @@ -cffi>=1.11.0 -numpy>=1.16.0 -qualname>=0.1.0 -keras_preprocessing>=1.1.2 -Pillow +cffi +numpy cmake-build-extension ninja requests regex -torch>=1.13.1 -torchaudio>=0.13.1 -torchvision>=0.14.1 -onnx +torch +torchaudio +torchvision transformers>=4.47.1 sentencepiece einops diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 406513a85..55d19763c 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -181,7 +181,8 @@ bool flexflow_config_get_enable_peft_finetuning(flexflow_config_t handle_) { FFConfig *handle = FFCObjectWrapper::unwrap(handle_); return handle->enable_peft_finetuning; } -void flexflow_config_set_enable_peft_finetuning(flexflow_config_t handle_, bool value) { +void flexflow_config_set_enable_peft_finetuning(flexflow_config_t handle_, + bool value) { FFConfig *handle = FFCObjectWrapper::unwrap(handle_); handle->enable_peft_finetuning = value; } @@ -2818,7 +2819,7 @@ void flexflow_request_manager_set_num_transformers_layers( RequestManager *handle = FFCObjectWrapper::unwrap(handle_); handle->set_num_transformer_layers(num_transformers_layers_); DEBUG_PRINT("[RequestManager] set num_transformers_layers %d", - num_transformers_layers_); + num_transformers_layers_); } void flexflow_request_manager_set_num_layers_per_finetuning_step( @@ -2826,7 +2827,7 @@ void flexflow_request_manager_set_num_layers_per_finetuning_step( RequestManager *handle = FFCObjectWrapper::unwrap(handle_); handle->set_num_layers_per_finetuning_step(num_layers_per_finetuning_step_); DEBUG_PRINT("[RequestManager] set num layers per finetuning step %d", - num_layers_per_finetuning_step_); + 
num_layers_per_finetuning_step_); } void flexflow_request_manager_register_tokenizer( diff --git a/src/ops/add_bias_residual_layer_norm.cpp b/src/ops/add_bias_residual_layer_norm.cpp index b58b6a45c..550a30d1b 100644 --- a/src/ops/add_bias_residual_layer_norm.cpp +++ b/src/ops/add_bias_residual_layer_norm.cpp @@ -41,10 +41,9 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( DataType data_type = ln->data_type; size_t in_dim = ln->inputs[0]->dims[0].size / ln->inputs[0]->dims[0].degree; allocated_peft_buffer_size = - enable_peft_finetuning - ? (data_type_size(data_type) * - BatchConfig::max_finetuning_sequence_length() * in_dim) - : 0; + enable_peft_finetuning ? (data_type_size(data_type) * + BatchConfig::max_sequence_length() * in_dim) + : 0; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3 + allocated_peft_buffer_size; gpu_mem_allocator.create_legion_instance( @@ -222,7 +221,7 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; assert(m->allocated_peft_buffer_size == data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim); + BatchConfig::max_sequence_length() * in_dim); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index de4357a43..16629f493 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -40,10 +40,9 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( DataType data_type = ln->data_type; size_t in_dim = ln->inputs[0]->dims[0].size / ln->inputs[0]->dims[0].degree; allocated_peft_buffer_size = - enable_peft_finetuning - ? (data_type_size(data_type) * - BatchConfig::max_finetuning_sequence_length() * in_dim) - : 0; + enable_peft_finetuning ? 
(data_type_size(data_type) * + BatchConfig::max_sequence_length() * in_dim) + : 0; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3 + allocated_peft_buffer_size; @@ -219,7 +218,7 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; assert(m->allocated_peft_buffer_size == data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim); + BatchConfig::max_sequence_length() * in_dim); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; @@ -809,4 +808,4 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( } } -}; // namespace FlexFlow +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/ops/argmax.cpp b/src/ops/argmax.cpp index 50ba64d8c..9c8ba7c5f 100644 --- a/src/ops/argmax.cpp +++ b/src/ops/argmax.cpp @@ -560,4 +560,4 @@ ArgMaxMeta::~ArgMaxMeta(void) { reserveInst.destroy(); } } -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu index 12bfa6052..e3b22af0d 100644 --- a/src/ops/argmax.cu +++ b/src/ops/argmax.cu @@ -275,4 +275,4 @@ ArgMaxMeta::~ArgMaxMeta(void) { reserveInst.destroy(); } } -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 7b6f4454a..a57ff2782 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -97,11 +97,8 @@ __host__ void assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); - bool softmax_grad_additional_region = - (fused->op_op_type[fused->numOperators - 1] == OP_SOFTMAX); - assert((int)regions.size() == fused->numInputs + fused->numWeights + - fused->numOutputs + - softmax_grad_additional_region); + assert((int)regions.size() == + fused->numInputs + fused->numWeights + fused->numOutputs); GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorW output_accessor[MAX_NUM_OUTPUTS]; @@ -589,23 +586,13 @@ __host__ void assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); - if (op == fused->numOperators - 1) { // if this is the final operator - output_accessor[fused->numOutputs] = helperGetGenericTensorAccessorWO( - fused->output_data_types[fused->numOutputs - 1], - regions[roff], - task->regions[roff], - FID_DATA, - ctx, - runtime); - } SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; Kernels::Softmax::inference_kernel_wrapper( m, bc, (op == fused->numOperators - 1), my_input_accessor[0], - my_output_accessor[0], - output_accessor[fused->numOutputs]); + my_output_accessor[0]); break; } case OP_ALLREDUCE: { @@ -680,7 +667,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); // Return if no active PEFT bwd tokens if (bc->num_finetuning_bwd_tokens() == 0) { - return; + return false; } assert(metas->numOperators == fused->numOperators); @@ -790,38 +777,6 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, my_output_grad_accessor[i] = output_grad_accessor[my_off]; } switch (fused->op_op_type[op]) { - case OP_CONCAT: { - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - // TODO: implement this - assert(false); - // 
ConcatMeta *m = (ConcatMeta *)metas->meta[op]; - // int num_inputs = fused->op_num_inputs[op]; - // Kernels::Concat::peft_bwd_kernel_wrapper(m, - // my_output_accessor[0], - // my_input_accessor, - // num_inputs, - // m->legion_axis); - break; - } - case OP_BATCHNORM: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_grad_accessor[0].domain.get_dim() == 5); - assert(my_output_grad_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 2); - assert(my_weight_accessor[1].domain.get_dim() == 2); - // TODO: implement this - assert(false); - // BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; - // BatchNorm::peft_bwd_kernel_kernel( - // m, - // my_input_accessor[0].get_float_ptr(), - // my_output_accessor[0].get_float_ptr(), - // my_weight_accessor[0].get_float_ptr(), - // my_weight_accessor[1].get_float_ptr()); - break; - } case OP_LINEAR: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -836,7 +791,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, LinearMeta *m = (LinearMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } assert(m->input_type[0] == my_input_grad_accessor[0].data_type); assert(m->input_type[0] == my_output_grad_accessor[0].data_type); @@ -848,9 +803,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, my_output_grad_accessor[0].ptr, my_weight_accessor[0].ptr, in_dim, - out_dim, - num_infr_tokens, - num_peft_tokens); + out_dim); break; } case OP_LORA: { @@ -868,7 +821,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } assert(m->input_type[0] == my_input_grad_accessor[0].data_type); assert(m->output_type[0] == my_output_grad_accessor[0].data_type); @@ -884,46 +837,8 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, shard_id, my_input_grad_accessor[0], my_output_grad_accessor[0]); - break; - } - case OP_BATCHMATMUL: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - Domain out_domain = my_output_grad_accessor[0].domain; - Domain a_domain = my_input_grad_accessor[0].domain; - Domain b_domain = my_input_grad_accessor[1].domain; - int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; - assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); - int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; - assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); - int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; - assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); - assert(a_domain.get_dim() == b_domain.get_dim()); - assert(a_domain.get_dim() == out_domain.get_dim()); - int batch = 1; - for (int i = 2; i < a_domain.get_dim(); i++) { - int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; - assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); - assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); - batch *= dim_size; - } - // TODO: implement me - assert(false); - // BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; - // Kernels::BatchMatmul::backward_kernel_wrapper( - // meta, - // my_output_accessor[0].get_float_ptr(), - // my_input_accessor[0].get_float_ptr(), - // my_input_accessor[1].get_float_ptr(), - // (float const *)nullptr, - // m, - // n, - // k, - // batch, - // meta->a_seq_length_dim, - // 
meta->b_seq_length_dim, - // fused->iter_config.seq_length); + Kernels::LoraLinear::save_peft_weights_if_needed( + m, bc, in_dim, out_dim, shard_id); break; } case OP_EW_ADD: @@ -951,37 +866,6 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, // so we do nothing for embedding break; } - case OP_GELU: - case OP_RELU: - case OP_SIGMOID: - case OP_TANH: - case OP_ELU: - case OP_SCALAR_TRUE_DIV: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_grad_accessor[0].domain == - my_output_grad_accessor[0].domain); - // TODO: implement me - assert(false); - // ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - // if (m->data_type == DT_HALF) { - // ElementUnary::forward_kernel_wrapper( - // m, - // my_input_accessor[0].get_half_ptr(), - // my_output_accessor[0].get_half_ptr(), - // my_input_accessor[0].domain.get_volume()); - // } else if (m->data_type == DT_FLOAT) { - // ElementUnary::forward_kernel_wrapper( - // m, - // my_input_accessor[0].get_float_ptr(), - // my_output_accessor[0].get_float_ptr(), - // my_input_accessor[0].domain.get_volume()); - // } else { - // assert(false && "Unsupported data type in ElementUnary forward"); - // } - break; - } case OP_RMS_NORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 1); @@ -989,7 +873,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } Kernels::RMSNorm::peft_bwd_kernel_wrapper(m, bc, @@ -1005,7 +889,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( m, @@ -1024,7 +908,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } assert(fused->op_num_weights[op] == 0); GenericTensorAccessorR biases; @@ -1037,19 +921,13 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, // biases); break; } - case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: - case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { - // TODO: implement me - assert(false); - break; - } case OP_LAYERNORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } if (m->elementwise_affine) { assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias)); @@ -1071,7 +949,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, (ResidualLayerNormMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } if (m->use_two_residuals) { assert(fused->op_num_inputs[op] == 3); @@ -1110,7 +988,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, (AddBiasResidualLayerNormMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } if (!m->elementwise_affine) { assert(fused->op_num_weights[op] == 1); // attn bias @@ -1140,7 +1018,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, SigmoidSiluMultiMeta const *m = 
(SigmoidSiluMultiMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } SigmoidSiluMulti::peft_bwd_kernel_wrapper(m, bc, @@ -1158,10 +1036,10 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } Kernels::Softmax::peft_bwd_kernel_wrapper( - m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + m, bc, my_input_grad_accessor[0]); break; } case OP_ALLREDUCE: { @@ -1170,7 +1048,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } Kernels::AllReduce::peft_bwd_kernel_wrapper( m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); @@ -1182,7 +1060,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, ParallelIdentityMeta const *m = (ParallelIdentityMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } runtime->concurrent_task_barrier(ctx); Kernels::ParallelIdentity::peft_bwd_kernel_wrapper( @@ -1221,13 +1099,16 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, } assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; - FusedOp::save_inference_tensors_to_file(metas->meta[op], - shard_id, - bc, - input_accessors_to_save, - weight_accessors_to_save, - output_accessors_to_save, - false); + if (bc->peft_bwd_applies_to_this_layer( + metas->meta[op]->layer_guid.transformer_layer_id)) { + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save, + false); + } } } return true; diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 8dd40c0cb..1ac58afda 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -116,14 +116,14 @@ __global__ void fill_entries_above_diagonal(DT *matrix, } } -bool is_inf_req_decoding_mode(BatchConfig const *bc, int i) { - return !bc->requestsInfo[i].finetuning_request && - !bc->requestsInfo[i].prompt_phase; +bool is_finetuning_bwd_request(BatchConfig const *bc, int request_id) { + return bc->requestsInfo[request_id].finetuning_request && + bc->requestsInfo[request_id].finetuning_backward_phase; } -bool is_finetuning_req_bwd_phase(BatchConfig const *bc, int i) { - return bc->requestsInfo[i].finetuning_request && - bc->requestsInfo[i].finetuning_backward_phase; +bool is_decoding_request(BatchConfig const *bc, int request_id) { + return !bc->requestsInfo[request_id].finetuning_request && + !bc->requestsInfo[request_id].prompt_phase; } template @@ -150,29 +150,37 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase && - !bc->requestsInfo[i].finetuning_request)) { + if (bc->request_completed[i] || is_decoding_request(bc, i) || + is_finetuning_bwd_request(bc, i)) { continue; } int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; + if (num_new_tokens <= 0) { + continue; + } // 
Copy query to m->query_activation_buffer if we need to compute // PEFT backward if (bc->requestsInfo[i].finetuning_request && !bc->requestsInfo[i].finetuning_backward_phase) { - int max_peft_tokens = BatchConfig::max_finetuning_sequence_length(); - // Check that we have at most one request that requires peft_bwd - assert(bc->num_finetuning_fwd_requests() == 1); - assert(bc->num_finetuning_bwd_requests() == 1); - assert(bc->requestsInfo[i].peft_model_id != PEFTModelID::NO_ID); - assert(!is_finetuning_req_bwd_phase(bc, i)); - int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); + // int max_peft_tokens = bc->requestsInfo[i].max_length; + int max_peft_tokens = BatchConfig::max_sequence_length(); size_t activation_size_needed = sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize; - assert(m->allocated_peft_buffer_size1 == activation_size_needed); - + if (activation_size_needed != m->allocated_peft_buffer_size1) { + std::cout << "activation_size_needed: " << activation_size_needed + << std::endl; + std::cout << "m->allocated_peft_buffer_size1: " + << m->allocated_peft_buffer_size1 << std::endl; + std::cout << "max_peft_tokens: " << max_peft_tokens << std::endl; + std::cout << "m->num_q_heads: " << m->num_q_heads << std::endl; + std::cout << "m->qProjSize: " << m->qProjSize << std::endl; + std::cout << "BatchConfig::max_sequence_length()" + << BatchConfig::max_sequence_length() << std::endl; + std::cout << "sizeof(DT)" << sizeof(DT) << std::endl; + } + assert(activation_size_needed == m->allocated_peft_buffer_size1); int parallelism = m->hidden_size * num_tokens; hipLaunchKernelGGL(HIP_KERNEL_NAME(store_query_cache), GET_BLOCKS(parallelism), @@ -317,18 +325,11 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, // Copy C_softmax to m->softmax_activation_buffer if we need to compute // PEFT backward if (bc->requestsInfo[i].finetuning_request) { - int max_peft_tokens = BatchConfig::max_finetuning_sequence_length(); - // Check that we have at most one request that requires peft_bwd - assert(bc->num_finetuning_fwd_requests() == 1); - assert(bc->num_finetuning_bwd_requests() == 0); - assert(bc->requestsInfo[i].peft_model_id != PEFTModelID::NO_ID); - assert(!is_finetuning_req_bwd_phase(bc, i)); - int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); + int max_peft_tokens = BatchConfig::max_sequence_length(); + DT *C_softmax = static_cast
<DT *>(m->qk_prods_softmax); size_t activation_size_needed = sizeof(DT) * max_peft_tokens * max_peft_tokens * m->num_q_heads; assert(activation_size_needed == m->allocated_peft_buffer_size2); - DT *C_softmax = static_cast<DT *>
(m->qk_prods_softmax); checkCUDA(hipMemcpyAsync(m->softmax_activation_buffer, C_softmax, sizeof(DT) * total_tokens * num_new_tokens * @@ -336,6 +337,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, hipMemcpyDeviceToDevice, stream)); } + // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ // softmax(QK.T/sqrt(d_k)).T { @@ -993,11 +995,11 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m, size_t qkv_proj_size = m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); - hipMemcpyAsync(m->devQKVProjArray, - qkv_ptr, - qkv_proj_size * sizeof(DT), - hipMemcpyDeviceToDevice, - stream); + checkCUDA(hipMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * sizeof(DT), + hipMemcpyDeviceToDevice, + stream)); // phase 1: Implement kernel to apply rotary embedding and scaling apply_scaling_and_rotary( @@ -1015,13 +1017,27 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m, compute_attention_kernel_prompt
(m, bc, shard_id, stream); } - // compute output production and bias together for all tokens + if (bc->num_finetuning_fwd_tokens() > 0) { + assert(m->peft_token_infos != nullptr); + assert(m->peft_token_infos_size == sizeof(BatchConfig::PerTokenInfo) * + BatchConfig::max_sequence_length()); + int num_ft_tokens = bc->num_finetuning_fwd_tokens(); + int i = bc->finetuning_request_index(); + int tokens_previous_requests = + bc->requestsInfo[i].first_token_offset_in_batch; + int prev_steps_tokens = bc->requestsInfo[i].first_token_depth_in_request; + for (int j = 0; j < num_ft_tokens; j++) { + m->peft_token_infos[prev_steps_tokens + j] = + bc->tokensInfo[tokens_previous_requests + j]; + } + } + int num_tokens = bc->num_active_tokens(); - hipMemcpyAsync(output_ptr, - m->attn_heads, - m->oProjSize * num_tokens * sizeof(DT), - hipMemcpyDeviceToDevice, - stream); + checkCUDA(hipMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + hipMemcpyDeviceToDevice, + stream)); } std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, @@ -1120,32 +1136,33 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, assert( bc->peft_bwd_applies_to_this_layer(m->layer_guid.transformer_layer_id)); int i = bc->finetuning_request_index(); - int num_tokens = bc->requestsInfo[i].num_tokens_in_batch; int num_total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; // Currently assume we are calculating gradients for all tokens // of a request assert(num_tokens == num_total_tokens); + assert(num_total_tokens == bc->requestsInfo[i].max_length); + assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); int kt_block_size = m->kProjSize; - int kt_req_block_size = kt_block_size * m->num_q_heads * - BatchConfig::max_finetuning_sequence_length(); + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); int vt_block_size = m->vProjSize; - int vt_req_block_size = vt_block_size * m->num_q_heads * - BatchConfig::max_finetuning_sequence_length(); - assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + // Step 1: copy gradient before final projection into workspace { int m_ = m->vProjSize * m->num_q_heads; int n_ = num_tokens; DT *C = static_cast
(m->handle.workSpace); - hipMemcpyAsync(C, - output_grad_ptr + - bc->requestsInfo[i].first_token_offset_in_batch * - m->oProjSize, - m_ * n_ * sizeof(DT), - hipMemcpyDeviceToDevice, - stream); + checkCUDA(hipMemcpyAsync( + C, + output_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->oProjSize, + m_ * n_ * sizeof(DT), + hipMemcpyDeviceToDevice, + stream)); if (m->inference_debugging) { // save result to file for checking std::string filename = @@ -1446,6 +1463,11 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // Step 7: perform rotary position embeddings (RoPE) bwd { if (m->rotary_embedding_meta->apply_rotary_embedding) { + checkCUDA(hipMemcpyAsync(m->peft_token_infos_device, + m->peft_token_infos, + m->peft_token_infos_size, + hipMemcpyHostToDevice, + stream)); assert(m->hidden_size == m->qProjSize * m->num_q_heads); assert(m->qProjSize == m->kProjSize); /*q&k*/ @@ -1459,7 +1481,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, stream, A, m->complex_input, - m->token_infos, + m->peft_token_infos_device, m->rotary_embedding_meta->rope_theta, (m->rotary_embedding_meta->rope_type == "llama3"), m->rotary_embedding_meta->factor, @@ -1752,24 +1774,34 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_t complex_size = (max_tokens_per_batch * (qProjSize * num_q_heads + kProjSize * num_q_heads)) / 2; - allocated_peft_buffer_size1 = - enable_peft_finetuning - ? (BatchConfig::max_finetuning_sequence_length() * num_q_heads * - qProjSize * size_of_dt) - : 0; - allocated_peft_buffer_size2 = - enable_peft_finetuning - ? (BatchConfig::max_finetuning_sequence_length() * - BatchConfig::max_finetuning_sequence_length() * num_q_heads * - size_of_dt) - : 0; + if (enable_peft_finetuning) { + allocated_peft_buffer_size1 = BatchConfig::max_sequence_length() * + num_q_heads * qProjSize * size_of_dt; + allocated_peft_buffer_size2 = BatchConfig::max_sequence_length() * + BatchConfig::max_sequence_length() * + num_q_heads * size_of_dt; + peft_token_infos = (BatchConfig::PerTokenInfo *)calloc( + 1, + sizeof(BatchConfig::PerTokenInfo) * + BatchConfig::max_sequence_length()); + peft_token_infos_size = sizeof(BatchConfig::PerTokenInfo) * + BatchConfig::max_sequence_length(); + } else { + allocated_peft_buffer_size1 = 0; + allocated_peft_buffer_size2 = 0; + peft_token_infos = nullptr; + peft_token_infos_size = 0; + } size_t totalSize = (qkv_max_proj_size + key_cache_size + value_cache_size + 2 * qk_prod_size + attn_heads_size) * size_of_dt + complex_size * sizeof(hipFloatComplex); // more components will // be added here later - totalSize += allocated_peft_buffer_size1 + allocated_peft_buffer_size2; + if (enable_peft_finetuning) { + totalSize += allocated_peft_buffer_size1 + allocated_peft_buffer_size2; + totalSize += peft_token_infos_size; + } if (offload) { // assert that we have enough reserved work space left size_t totalSharedSize = @@ -1841,6 +1873,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( allocated_peft_buffer_size1); softmax_activation_buffer = gpu_mem_allocator.allocate_instance_untyped( allocated_peft_buffer_size2); + peft_token_infos_device = (BatchConfig::PerTokenInfo *) + gpu_mem_allocator.allocate_instance_untyped( + peft_token_infos_size); } // allocate more size for quantization data @@ -1852,8 +1887,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( gpu_mem_allocator.reserved_allocated_size); } } - allocated_peft_buffer_size1 = 0; - allocated_peft_buffer_size2 = 0; + 
checkCUDA(hipStreamSynchronize(stream)); } diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 35dd27d36..c2aebcac6 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1927,10 +1927,13 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, } // Step 7: perform rotary position embeddings (RoPE) bwd - // todo: first sum the gradients wrt each q_head to obtain the gradients wrt - // each key head { if (m->rotary_embedding_meta->apply_rotary_embedding) { + checkCUDA(cudaMemcpyAsync(m->peft_token_infos_device, + m->peft_token_infos, + m->peft_token_infos_size, + cudaMemcpyHostToDevice, + stream)); assert(m->hidden_size == m->qProjSize * m->num_q_heads); assert(m->qProjSize == m->kProjSize); /*q&k*/ @@ -1942,7 +1945,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, stream>>>( A, m->complex_input, - m->token_infos, + m->peft_token_infos_device, m->rotary_embedding_meta->rope_theta, (m->rotary_embedding_meta->rope_type == "llama3"), m->rotary_embedding_meta->factor, @@ -2257,15 +2260,23 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( 2; if (enable_peft_finetuning) { allocated_peft_buffer_size1 = - BatchConfig::max_finetuning_sequence_length() * num_q_heads * + BatchConfig::max_sequence_length() * num_q_heads * qProjSize * size_of_dt; allocated_peft_buffer_size2 = - BatchConfig::max_finetuning_sequence_length() * - BatchConfig::max_finetuning_sequence_length() * num_q_heads * + BatchConfig::max_sequence_length() * + BatchConfig::max_sequence_length() * num_q_heads * size_of_dt; + peft_token_infos = (BatchConfig::PerTokenInfo *)calloc( + 1, + sizeof(BatchConfig::PerTokenInfo) * + BatchConfig::max_sequence_length()); + peft_token_infos_size = sizeof(BatchConfig::PerTokenInfo) * + BatchConfig::max_sequence_length(); } else { allocated_peft_buffer_size1 = 0; allocated_peft_buffer_size2 = 0; + peft_token_infos = nullptr; + peft_token_infos_size = 0; } size_t totalSize = (qkv_max_proj_size + query_tmp_size + key_cache_size + value_cache_size + 2 * qk_prod_size + attn_heads_size + @@ -2275,6 +2286,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( 3 * gqa_ptr_array_size; if (enable_peft_finetuning) { totalSize += allocated_peft_buffer_size1 + allocated_peft_buffer_size2; + totalSize += peft_token_infos_size; totalSize += 3 * gqa_ptr_array_size; } if (offload) { @@ -2382,6 +2394,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( allocated_peft_buffer_size1); softmax_activation_buffer = gpu_mem_allocator.allocate_instance_untyped( allocated_peft_buffer_size2); + peft_token_infos_device = (BatchConfig::PerTokenInfo *) + gpu_mem_allocator.allocate_instance_untyped( + peft_token_infos_size); } // allocate more size for quantization data diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 2111dff50..1d46dcf90 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -40,11 +40,24 @@ LinearMeta::LinearMeta(FFHandler handler, gpu_mem_allocator.allocate_reserved(quantized_weightSize); } } - // Allocate an all-one's vector + // peft activation + size_t out_dim = + li->outputs[0]->dims[0].size / li->outputs[0]->dims[0].degree; + allocated_peft_buffer_size = + enable_peft_finetuning ? 
(data_type_size(data_type) * + BatchConfig::max_sequence_length() * out_dim) + : 0; + size_t totalSize = + data_type_size(data_type) * batch_size + allocated_peft_buffer_size; gpu_mem_allocator.create_legion_instance( - reserveInst, data_type_size(data_type) * batch_size, "LinearMeta"); + reserveInst, totalSize, "LinearMeta"); + // Allocate an all-one's vector one_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * batch_size); + if (enable_peft_finetuning) { + output_activation_buffer = + gpu_mem_allocator.allocate_instance_untyped(allocated_peft_buffer_size); + } int parallelism = batch_size; hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -211,6 +224,11 @@ void inference_kernel_wrapper(LinearMeta *m, out_dim, batch_size, stream); + if ((m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) && + bc->num_finetuning_fwd_requests() > 0) { + Internal::store_peft_activations( + m, bc, out_dim, static_cast(output_ptr), stream); + } } else if (m->input_type[0] == DT_HALF) { Internal::forward_kernel(m, input_ptr, @@ -221,41 +239,10 @@ void inference_kernel_wrapper(LinearMeta *m, out_dim, batch_size, stream); - } - - if (m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) { - // save input activation if needed for PEFT - if (bc->num_finetuning_fwd_requests() > 0) { - // Check that we have at most one request that requires peft_bwd - assert(bc->num_finetuning_fwd_tokens() >= 1); - int i = bc->finetuning_request_index(); - assert(bc->requestsInfo[i].peft_model_id != PEFTModelID::NO_ID); - assert(!bc->requestsInfo[i].finetuning_backward_phase); - int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; - assert(m->allocated_peft_buffer_size == - data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim); - int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); - int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; - // copy input activation - if (m->input_type[0] == DT_FLOAT) { - checkCUDA(hipMemcpyAsync( - m->input_activation, - added_output.get_float_ptr() + first_token_offset * in_dim, - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, - hipMemcpyDeviceToDevice, - stream)); - } else if (m->input_type[0] == DT_HALF) { - checkCUDA(hipMemcpyAsync( - m->input_activation, - added_output.get_half_ptr() + first_token_offset * in_dim, - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, - hipMemcpyDeviceToDevice, - stream)); - } else { - assert(false && "unsupport datatype in layernorm"); - } + if ((m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) && + bc->num_finetuning_fwd_requests() > 0) { + Internal::store_peft_activations( + m, bc, out_dim, static_cast(output_ptr), stream); } } @@ -276,9 +263,7 @@ void peft_bwd_kernel_wrapper(LinearMeta const *m, void *output_grad_ptr, void const *weight_ptr, int in_dim, - int out_dim, - int num_infr_tokens, - int num_peft_tokens) { + int out_dim) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); hipEvent_t t_start, t_end; @@ -295,8 +280,6 @@ void peft_bwd_kernel_wrapper(LinearMeta const *m, weight_ptr, in_dim, out_dim, - num_infr_tokens, - num_peft_tokens, stream); } else if (m->input_type[0] == DT_HALF) { Internal::peft_bwd_kernel(m, @@ -306,8 +289,6 @@ void peft_bwd_kernel_wrapper(LinearMeta const *m, weight_ptr, in_dim, out_dim, - num_infr_tokens, - num_peft_tokens, stream); } @@ -561,6 +542,34 @@ void 
forward_kernel(LinearMeta const *m, } } +template +void store_peft_activations(LinearMeta const *m, + BatchConfig const *bc, + size_t out_dim, + DT *output_ptr, + hipStream_t stream) { + int i = bc->finetuning_request_index(); + int num_ft_tokens = bc->num_finetuning_fwd_tokens(); + int tokens_previous_requests = + bc->requestsInfo[i].first_token_offset_in_batch; + int tokens_previous_steps = bc->requestsInfo[i].first_token_offset_in_batch; + size_t data_size = out_dim * num_ft_tokens * sizeof(DT); + size_t batch_offset = out_dim * tokens_previous_requests; + size_t request_offset = out_dim * tokens_previous_steps; + assert(bc->num_finetuning_fwd_tokens() >= 1); + assert(bc->requestsInfo[i].peft_model_id != PEFTModelID::NO_ID); + assert(!bc->requestsInfo[i].finetuning_backward_phase); + assert(bc->requestsInfo[i].num_tokens_in_batch == num_ft_tokens); + assert(m->allocated_peft_buffer_size >= data_size); + + checkCUDA(hipMemcpyAsync(static_cast
<DT *>(m->output_activation_buffer) + + request_offset, + output_ptr + batch_offset, + data_size, + hipMemcpyDeviceToDevice, + stream)); +} + template <typename DT> void peft_bwd_kernel(LinearMeta const *m, BatchConfig const *bc, @@ -569,21 +578,22 @@ void peft_bwd_kernel(LinearMeta const *m, void const *kernel_ptr, int in_dim, int out_dim, - int num_infr_tokens, - int num_peft_tokens, ffStream_t stream) { checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + assert( + bc->peft_bwd_applies_to_this_layer(m->layer_guid.transformer_layer_id)); + int i = bc->finetuning_request_index(); + int num_peft_tokens = bc->num_finetuning_bwd_tokens(); + assert(bc->num_finetuning_bwd_requests() == 1); + hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); - // update input_grad_ptr and output_grad_ptr offset - int num_infr_only_tokens = num_infr_tokens - num_peft_tokens; - input_grad_ptr = - static_cast
<DT *>(input_grad_ptr) + num_infr_only_tokens * in_dim; - output_grad_ptr = - static_cast<DT *>
(output_grad_ptr) + num_infr_only_tokens * out_dim; + input_grad_ptr = static_cast<DT *>
(input_grad_ptr); + output_grad_ptr = static_cast<DT *>
(output_grad_ptr); + hipblasDatatype_t compute_type = output_type; int output_size = out_dim * num_peft_tokens; if (m->activation == AC_MODE_RELU) { @@ -607,31 +617,13 @@ void peft_bwd_kernel(LinearMeta const *m, // NOTE: we use beta=1 for input_grad to accumulate gradients when needed DT alpha = 1.0f; DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; - - // ensure that we only have one finetuning request, with a single lora - int num_peft_requests = 0; - bool lora_applies = false; - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || - bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || - !bc->requestsInfo[i].peft_bwd) { - continue; - } - num_peft_requests++; - std::string peft_model_config_str = - std::string(bc->requestsInfo[i].peft_model_config_str); - LoraLinearConfig lora_config = - LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); - if (!lora_applies_to_this_layer(m, lora_config)) { - continue; - } - lora_applies = true; - } - assert(num_peft_requests == 1 && - "Exactly one PEFT finetuning request is required"); + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + bool lora_applies = lora_applies_to_this_layer(m, lora_config); // if the request does not have any active lora in the current layer, reset - // beta to 0 std::cout << m->op_name << " original beta: " << (float)beta << " - // lora_applies: " << lora_applies << std::endl; + // beta to 0 if (lora_applies) { beta = 1.0f; } diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 0df5ab723..b0d76a958 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -43,10 +43,9 @@ LinearMeta::LinearMeta(FFHandler handler, size_t out_dim = li->outputs[0]->dims[0].size / li->outputs[0]->dims[0].degree; allocated_peft_buffer_size = - enable_peft_finetuning - ? (data_type_size(data_type) * - BatchConfig::max_finetuning_sequence_length() * out_dim) - : 0; + enable_peft_finetuning ? 
(data_type_size(data_type) * + BatchConfig::max_sequence_length() * out_dim) + : 0; size_t totalSize = data_type_size(data_type) * batch_size + allocated_peft_buffer_size; gpu_mem_allocator.create_legion_instance( @@ -226,8 +225,10 @@ void inference_kernel_wrapper(LinearMeta *m, out_dim, batch_size, stream); - if ((m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) && bc->num_finetuning_fwd_requests() > 0) { - Internal::store_peft_activations(m, bc, out_dim, static_cast(output_ptr), stream); + if ((m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) && + bc->num_finetuning_fwd_requests() > 0) { + Internal::store_peft_activations( + m, bc, out_dim, static_cast(output_ptr), stream); } } else if (m->input_type[0] == DT_HALF) { Internal::forward_kernel(m, @@ -239,8 +240,10 @@ void inference_kernel_wrapper(LinearMeta *m, out_dim, batch_size, stream); - if ((m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) && bc->num_finetuning_fwd_requests() > 0) { - Internal::store_peft_activations(m, bc, out_dim, static_cast(output_ptr), stream); + if ((m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) && + bc->num_finetuning_fwd_requests() > 0) { + Internal::store_peft_activations( + m, bc, out_dim, static_cast(output_ptr), stream); } } @@ -537,7 +540,8 @@ void store_peft_activations(LinearMeta const *m, cudaStream_t stream) { int i = bc->finetuning_request_index(); int num_ft_tokens = bc->num_finetuning_fwd_tokens(); - int tokens_previous_requests = bc->requestsInfo[i].first_token_offset_in_batch; + int tokens_previous_requests = + bc->requestsInfo[i].first_token_offset_in_batch; int tokens_previous_steps = bc->requestsInfo[i].first_token_offset_in_batch; size_t data_size = out_dim * num_ft_tokens * sizeof(DT); size_t batch_offset = out_dim * tokens_previous_requests; @@ -547,8 +551,9 @@ void store_peft_activations(LinearMeta const *m, assert(!bc->requestsInfo[i].finetuning_backward_phase); assert(bc->requestsInfo[i].num_tokens_in_batch == num_ft_tokens); assert(m->allocated_peft_buffer_size >= data_size); - - checkCUDA(cudaMemcpyAsync(static_cast(m->output_activation_buffer) + request_offset, + + checkCUDA(cudaMemcpyAsync(static_cast
(m->output_activation_buffer) + + request_offset, output_ptr + batch_offset, data_size, cudaMemcpyDeviceToDevice, @@ -576,7 +581,6 @@ void peft_bwd_kernel(LinearMeta const *m, cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); - input_grad_ptr = static_cast
<DT *>(input_grad_ptr); output_grad_ptr = static_cast<DT *>
(output_grad_ptr); cudaDataType_t compute_type = output_type; diff --git a/src/ops/kernels/lora_linear_kernels.cpp b/src/ops/kernels/lora_linear_kernels.cpp index 2837ad044..cadde8233 100644 --- a/src/ops/kernels/lora_linear_kernels.cpp +++ b/src/ops/kernels/lora_linear_kernels.cpp @@ -194,7 +194,7 @@ void inference_kernel(LoraLinearMeta *m, bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { continue; } - if (bc->requestsInfo[i].peft_bwd) { + if (bc->requestsInfo[i].finetuning_request) { num_peft_requests++; } std::string peft_model_config_str = @@ -206,17 +206,18 @@ void inference_kernel(LoraLinearMeta *m, } // std::cout << "Lora layer activated!" << std::endl; // std::cout << "Lora Config: " << peft_model_config_str << std::endl; - assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && + assert(lora_config.trainable == bc->requestsInfo[i].finetuning_request && "Trainable flag mismatch"); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + // assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); // int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; LoraLinearWeight weight = m->peft_memory_manager->get_peft( bc->requestsInfo[i].peft_model_id, lora_config); - void *intermediate_result_ptr = (bc->requestsInfo[i].peft_bwd) + void *intermediate_result_ptr = (bc->requestsInfo[i].finetuning_request) ? weight.low_rank_activation : m->handle.workSpace; - if (bc->requestsInfo[i].peft_bwd) { + if (bc->requestsInfo[i].finetuning_request) { checkCUDA(hipMemcpyAsync(weight.input_activation, input_ptr + first_token_offset * in_dim, data_type_size(m->input_type[0]) * @@ -322,151 +323,50 @@ void peft_bwd_kernel(Context ctx, hipblasDatatype_t lr_actv_type = output_type; hipblasDatatype_t compute_type = output_type; - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - // Skip completed, non-PEFT and PEFT forward-only requests - if (bc->request_completed[i] || - bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || - !bc->requestsInfo[i].peft_bwd) { - continue; - } - std::string peft_model_config_str = - std::string(bc->requestsInfo[i].peft_model_config_str); - LoraLinearConfig lora_config = - LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); - if (!lora_applies_to_this_layer(m, lora_config)) { - continue; - } - // std::cout << "Lora layer activated!" << std::endl; - // std::cout << "Lora Config: " << peft_model_config_str << std::endl; - assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && - "Trainable flag mismatch"); - m->peft_memory_manager->check_ft_model_id( - bc->requestsInfo[i].peft_model_id); - int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - // int max_peft_tokens = bc->requestsInfo[i].max_length; - // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; - LoraLinearWeight weight = m->peft_memory_manager->get_peft( - bc->requestsInfo[i].peft_model_id, lora_config); - DT scaling_constant = (DT)(lora_config.lora_alpha / lora_config.rank); - - // Compute LORA_B weight's gradient - if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) { - DT alpha = 1.0f; - DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) - ? 
0.0f - : 1.0f; - // std::cout << "Lora B gradient computation, beta = " << (float) beta << - // std::endl; - if (m->inference_debugging) { - // save result to file for checking - std::string filename = - get_peft_dbg_folder(m, shard_id, false) + ".low_rank_activation"; - std::cout << "Save low_rank_activation (" << lora_config.rank << ", " - << num_peft_tokens << ") to " << filename << std::endl; - save_tensor(static_cast(weight.low_rank_activation), - lora_config.rank * num_peft_tokens, - filename.c_str()); - } - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - lora_config.rank, - out_dim, - num_peft_tokens, - &scaling_constant, - weight.low_rank_activation, - lr_actv_type, - lora_config.rank, - output_grad_ptr, - output_type, - out_dim, - &beta, - weight.w1_grad_ptr, - weight_type, - lora_config.rank, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - } - - // Compute LORA_B input's (and LORA_A output's) gradient inplace in - // low_rank_activation - { - DT alpha = 1.0f, beta = 0.0f; - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_N, - lora_config.rank, - num_peft_tokens, - out_dim, - &scaling_constant, - weight.w1_ptr, - weight_type, - lora_config.rank, - output_grad_ptr, - output_type, - out_dim, - &beta, - weight.low_rank_activation, - lr_actv_type, - lora_config.rank, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - } + assert( + bc->peft_bwd_applies_to_this_layer(m->layer_guid.transformer_layer_id)); + int i = bc->finetuning_request_index(); + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + if (!lora_applies_to_this_layer(m, lora_config)) { + return; + } + // std::cout << "Lora layer activated!" << std::endl; + // std::cout << "Lora Config: " << peft_model_config_str << std::endl; + assert(lora_config.trainable == bc->requestsInfo[i].finetuning_request && + "Trainable flag mismatch"); + m->peft_memory_manager->check_ft_model_id(bc->requestsInfo[i].peft_model_id); + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + assert(num_peft_tokens == bc->num_finetuning_bwd_tokens()); + // int max_peft_tokens = bc->requestsInfo[i].max_length; + // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + LoraLinearWeight weight = m->peft_memory_manager->get_peft( + bc->requestsInfo[i].peft_model_id, lora_config); + DT scaling_constant = (DT)(lora_config.lora_alpha / lora_config.rank); - // Compute LORA_A weight's gradient - if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) { - DT alpha = 1.0f; - DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) - ? 0.0f - : 1.0f; - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - in_dim, - lora_config.rank, - num_peft_tokens, - &alpha, - weight.input_activation, - input_type, - in_dim, - weight.low_rank_activation, - lr_actv_type, - lora_config.rank, - &beta, - weight.w0_grad_ptr, - weight_type, - in_dim, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - } - // Compute input gradient - // NOTE: we use beta=1 for input_grad to accumulate gradients when needed - if (input_grad_ptr != nullptr) { - DT alpha = 1.0f; - DT beta = m->reset_input_grads[0] ? 
0.0f : 1.0f; - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_N, - in_dim, - num_peft_tokens, - lora_config.rank, - &alpha, - weight.w0_ptr, - weight_type, - in_dim, - weight.low_rank_activation, - lr_actv_type, - lora_config.rank, - &beta, - input_grad_ptr, - input_type, - in_dim, - compute_type, - HIPBLAS_GEMM_DEFAULT)); + // Compute LORA_B weight's gradient + if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) { + DT alpha = 1.0f; + DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) + ? 0.0f + : 1.0f; + // std::cout << "Lora B gradient computation, beta = " << (float) beta << + // std::endl; + if (m->inference_debugging) { + // save result to file for checking + std::string filename = + get_peft_dbg_folder(m, shard_id, false) + ".low_rank_activation"; + std::cout << "Save low_rank_activation (" << lora_config.rank << ", " + << num_peft_tokens << ") to " << filename << std::endl; + save_tensor(static_cast(weight.low_rank_activation), + lora_config.rank * num_peft_tokens, + filename.c_str()); } checkCUDA(hipblasGemmEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, + HIPBLAS_OP_N, + HIPBLAS_OP_T, lora_config.rank, out_dim, num_peft_tokens, @@ -485,6 +385,83 @@ void peft_bwd_kernel(Context ctx, HIPBLAS_GEMM_DEFAULT)); } + // Compute LORA_B input's (and LORA_A output's) gradient inplace in + // low_rank_activation + { + DT alpha = 1.0f, beta = 0.0f; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + lora_config.rank, + num_peft_tokens, + out_dim, + &scaling_constant, + weight.w1_ptr, + weight_type, + lora_config.rank, + output_grad_ptr, + output_type, + out_dim, + &beta, + weight.low_rank_activation, + lr_actv_type, + lora_config.rank, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + + // Compute LORA_A weight's gradient + if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) { + DT alpha = 1.0f; + DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) + ? 0.0f + : 1.0f; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + in_dim, + lora_config.rank, + num_peft_tokens, + &alpha, + weight.input_activation, + input_type, + in_dim, + weight.low_rank_activation, + lr_actv_type, + lora_config.rank, + &beta, + weight.w0_grad_ptr, + weight_type, + in_dim, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + // Compute input gradient + // NOTE: we use beta=1 for input_grad to accumulate gradients when needed + if (input_grad_ptr != nullptr) { + DT alpha = 1.0f; + DT beta = m->reset_input_grads[0] ? 
0.0f : 1.0f; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + in_dim, + num_peft_tokens, + lora_config.rank, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + weight.low_rank_activation, + lr_actv_type, + lora_config.rank, + &beta, + input_grad_ptr, + input_type, + in_dim, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + if (bc->requestsInfo[i].optimizer_tasks.update_weights) { assert(lora_config.optimizer_config != nullptr); int w0_num_elements = lora_config.rank * in_dim; diff --git a/src/ops/kernels/residual_rms_norm_kernels.cpp b/src/ops/kernels/residual_rms_norm_kernels.cpp index 8e262a78c..d5a0c5c9d 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cpp +++ b/src/ops/kernels/residual_rms_norm_kernels.cpp @@ -39,14 +39,24 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, DataType data_type = rms->weights[0]->data_type; size_t rms_ptr_size = batch_size; size_t norm_ptr_size = num_elements; - size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type); + size_t in_dim = rms->inputs[0]->dims[0].size / rms->inputs[0]->dims[0].degree; + allocated_peft_buffer_size = + enable_peft_finetuning ? (data_type_size(data_type) * + BatchConfig::max_sequence_length() * in_dim) + : 0; + size_t totalSize = + (rms_ptr_size + norm_ptr_size) * data_type_size(data_type) + + allocated_peft_buffer_size; gpu_mem_allocator.create_legion_instance( reserveInst, totalSize, "ResidualRMSNormMeta"); rms_ptr = gpu_mem_allocator.allocate_instance_untyped( rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); - allocated_peft_buffer_size = 0; + if (enable_peft_finetuning) { + input_activation = + gpu_mem_allocator.allocate_instance_untyped(allocated_peft_buffer_size); + } } ResidualRMSNormMeta::~ResidualRMSNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { @@ -206,6 +216,35 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, } } +template +void store_peft_activations(ResidualRMSNormMeta const *m, + BatchConfig const *bc, + size_t in_dim, + DT *residual_output_ptr, + hipStream_t stream) { + assert(m->enable_peft_finetuning); + assert(bc->num_finetuning_fwd_tokens() >= 1); + + int num_ft_tokens = bc->num_finetuning_fwd_tokens(); + int i = bc->finetuning_request_index(); + int tokens_previous_requests = + bc->requestsInfo[i].first_token_offset_in_batch; + int tokens_previous_steps = bc->requestsInfo[i].first_token_offset_in_batch; + assert(bc->requestsInfo[i].num_tokens_in_batch == num_ft_tokens); + + size_t batch_offset = in_dim * tokens_previous_requests; + size_t request_offset = in_dim * tokens_previous_steps; + size_t data_size = in_dim * num_ft_tokens * sizeof(DT); + assert(m->allocated_peft_buffer_size >= data_size); + + checkCUDA( + hipMemcpyAsync(static_cast
(m->input_activation) + request_offset, + residual_output_ptr + batch_offset, + data_size, + hipMemcpyDeviceToDevice, + stream)); +} + void inference_kernel_wrapper(ResidualRMSNormMeta *m, BatchConfig const *bc, GenericTensorAccessorR const &input1, @@ -226,6 +265,7 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, assert(output.data_type == input1.data_type); assert(weight.data_type == output.data_type); assert(residual_output.data_type == output.data_type); + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (output.data_type == DT_HALF) { forward_kernel(m, @@ -235,6 +275,10 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, residual_output.get_half_ptr(), output.get_half_ptr(), stream); + if (bc->num_finetuning_fwd_requests() > 0) { + store_peft_activations( + m, bc, in_dim, residual_output.get_half_ptr(), stream); + } } else if (output.data_type == DT_FLOAT) { forward_kernel(m, input1.get_float_ptr(), @@ -243,45 +287,14 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, residual_output.get_float_ptr(), output.get_float_ptr(), stream); + if (bc->num_finetuning_fwd_requests() > 0) { + store_peft_activations( + m, bc, in_dim, residual_output.get_float_ptr(), stream); + } } else { assert(false && "Unsupported data type"); } - // save input activation if needed for PEFT. This must be done after the - // forward kernel since that's where we add the residual - if (bc->num_finetuning_fwd_requests() > 0) { - // Check that we have at most one request that requires peft_bwd - assert(bc->num_finetuning_fwd_tokens() >= 1); - int i = bc->finetuning_request_index(); - assert(bc->requestsInfo[i].peft_model_id != PEFTModelID::NO_ID); - assert(!bc->requestsInfo[i].finetuning_backward_phase); - int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; - assert(m->allocated_peft_buffer_size == - data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim); - int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); - int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; - // copy input activation - if (m->input_type[0] == DT_FLOAT) { - checkCUDA(hipMemcpyAsync( - m->input_activation, - added_output.get_float_ptr() + first_token_offset * in_dim, - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, - hipMemcpyDeviceToDevice, - stream)); - } else if (m->input_type[0] == DT_HALF) { - checkCUDA(hipMemcpyAsync( - m->input_activation, - added_output.get_half_ptr() + first_token_offset * in_dim, - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, - hipMemcpyDeviceToDevice, - stream)); - } else { - assert(false && "unsupport datatype in layernorm"); - } - } - if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index f952f0727..a576bad33 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -41,10 +41,9 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, size_t norm_ptr_size = num_elements; size_t in_dim = rms->inputs[0]->dims[0].size / rms->inputs[0]->dims[0].degree; allocated_peft_buffer_size = - enable_peft_finetuning - ? (data_type_size(data_type) * - BatchConfig::max_finetuning_sequence_length() * in_dim) - : 0; + enable_peft_finetuning ? 
(data_type_size(data_type) * + BatchConfig::max_sequence_length() * in_dim) + : 0; size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type) + allocated_peft_buffer_size; @@ -222,23 +221,25 @@ void store_peft_activations(ResidualRMSNormMeta const *m, cudaStream_t stream) { assert(m->enable_peft_finetuning); assert(bc->num_finetuning_fwd_tokens() >= 1); - + int num_ft_tokens = bc->num_finetuning_fwd_tokens(); int i = bc->finetuning_request_index(); - int tokens_previous_requests = bc->requestsInfo[i].first_token_offset_in_batch; + int tokens_previous_requests = + bc->requestsInfo[i].first_token_offset_in_batch; int tokens_previous_steps = bc->requestsInfo[i].first_token_offset_in_batch; assert(bc->requestsInfo[i].num_tokens_in_batch == num_ft_tokens); - + size_t batch_offset = in_dim * tokens_previous_requests; size_t request_offset = in_dim * tokens_previous_steps; size_t data_size = in_dim * num_ft_tokens * sizeof(DT); assert(m->allocated_peft_buffer_size >= data_size); - - checkCUDA(cudaMemcpyAsync(static_cast(m->input_activation) + request_offset, - residual_output_ptr + batch_offset, - data_size, - cudaMemcpyDeviceToDevice, - stream)); + + checkCUDA( + cudaMemcpyAsync(static_cast
(m->input_activation) + request_offset, + residual_output_ptr + batch_offset, + data_size, + cudaMemcpyDeviceToDevice, + stream)); } void inference_kernel_wrapper(ResidualRMSNormMeta *m, @@ -272,7 +273,8 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, output.get_half_ptr(), stream); if (bc->num_finetuning_fwd_requests() > 0) { - store_peft_activations(m, bc, in_dim, residual_output.get_half_ptr(), stream); + store_peft_activations( + m, bc, in_dim, residual_output.get_half_ptr(), stream); } } else if (output.data_type == DT_FLOAT) { forward_kernel(m, @@ -283,14 +285,13 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, output.get_float_ptr(), stream); if (bc->num_finetuning_fwd_requests() > 0) { - store_peft_activations(m, bc, in_dim, residual_output.get_float_ptr(), stream); + store_peft_activations( + m, bc, in_dim, residual_output.get_float_ptr(), stream); } } else { assert(false && "Unsupported data type"); } - - if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); diff --git a/src/ops/kernels/rms_norm_kernels.cpp b/src/ops/kernels/rms_norm_kernels.cpp index 1bc33b26d..503613923 100644 --- a/src/ops/kernels/rms_norm_kernels.cpp +++ b/src/ops/kernels/rms_norm_kernels.cpp @@ -22,6 +22,7 @@ namespace FlexFlow { // declare Legion names using Legion::coord_t; + #define C10_WARP_SIZE 32 RMSNormMeta::RMSNormMeta(FFHandler handler, @@ -32,6 +33,7 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, in_dim = rms->data_dim; batch_size = rms->effective_batch_size; + enable_peft_finetuning = rms->enable_peft_finetuning; num_elements = in_dim * batch_size; DataType data_type = rms->weights[0]->data_type; @@ -39,10 +41,9 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, size_t norm_ptr_size = num_elements; size_t in_dim = rms->inputs[0]->dims[0].size / rms->inputs[0]->dims[0].degree; allocated_peft_buffer_size = - enable_peft_finetuning - ? (data_type_size(data_type) * - BatchConfig::max_finetuning_sequence_length() * in_dim) - : 0; + enable_peft_finetuning ? 
(data_type_size(data_type) * + BatchConfig::max_sequence_length() * in_dim) + : 0; size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type) + allocated_peft_buffer_size; @@ -222,25 +223,39 @@ void inference_kernel_wrapper(RMSNormMeta *m, assert(bc->requestsInfo[i].peft_model_id != PEFTModelID::NO_ID); assert(!bc->requestsInfo[i].finetuning_backward_phase); int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (m->allocated_peft_buffer_size != + data_type_size(m->input_type[0]) * BatchConfig::max_sequence_length() * + in_dim) { + std::cout << "allocated_peft_buffer_size = " + << m->allocated_peft_buffer_size << ", expected = " + << data_type_size(m->input_type[0]) * + BatchConfig::max_sequence_length() * in_dim + << std::endl; + std::cout << "in_dim = " << in_dim << std::endl; + std::cout << "max_sequence_length = " + << BatchConfig::max_sequence_length() << std::endl; + std::cout << "data_type_size = " << data_type_size(m->input_type[0]) + << std::endl; + } assert(m->allocated_peft_buffer_size == data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim); + BatchConfig::max_sequence_length() * in_dim); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; // copy input activation - if (m->input_type[0] == DT_FLOAT) { + if (input.data_type == DT_FLOAT) { checkCUDA(hipMemcpyAsync( m->input_activation, - added_output.get_float_ptr() + first_token_offset * in_dim, - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + input.get_float_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, hipMemcpyDeviceToDevice, stream)); - } else if (m->input_type[0] == DT_HALF) { + } else if (input.data_type == DT_HALF) { checkCUDA(hipMemcpyAsync( m->input_activation, - added_output.get_half_ptr() + first_token_offset * in_dim, - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + input.get_half_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, hipMemcpyDeviceToDevice, stream)); } else { diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 4fba94945..928770616 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -41,10 +41,9 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, size_t norm_ptr_size = num_elements; size_t in_dim = rms->inputs[0]->dims[0].size / rms->inputs[0]->dims[0].degree; allocated_peft_buffer_size = - enable_peft_finetuning - ? (data_type_size(data_type) * - BatchConfig::max_finetuning_sequence_length() * in_dim) - : 0; + enable_peft_finetuning ? 
(data_type_size(data_type) * + BatchConfig::max_sequence_length() * in_dim) + : 0; size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type) + allocated_peft_buffer_size; @@ -222,22 +221,22 @@ void inference_kernel_wrapper(RMSNormMeta *m, assert(!bc->requestsInfo[i].finetuning_backward_phase); int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (m->allocated_peft_buffer_size != - data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim) { + data_type_size(m->input_type[0]) * BatchConfig::max_sequence_length() * + in_dim) { std::cout << "allocated_peft_buffer_size = " << m->allocated_peft_buffer_size << ", expected = " << data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim + BatchConfig::max_sequence_length() * in_dim << std::endl; std::cout << "in_dim = " << in_dim << std::endl; std::cout << "max_sequence_length = " - << BatchConfig::max_finetuning_sequence_length() << std::endl; + << BatchConfig::max_sequence_length() << std::endl; std::cout << "data_type_size = " << data_type_size(m->input_type[0]) << std::endl; } assert(m->allocated_peft_buffer_size == data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim); + BatchConfig::max_sequence_length() * in_dim); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index 574547858..bf0d58a05 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -24,7 +24,9 @@ using Legion::Domain; SoftmaxMeta::SoftmaxMeta(FFHandler handler, Softmax const *softmax, - Domain const &input_domain) + Domain const &input_domain, + bool is_last_op, + MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, softmax) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( @@ -36,6 +38,17 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, profiling = softmax->profiling; inference_debugging = softmax->inference_debugging; enable_peft_finetuning = softmax->enable_peft_finetuning; + if (enable_peft_finetuning && is_last_op) { + allocated_peft_buffer_size = + input_domain.get_volume() * data_type_size(softmax->data_type); + gpu_mem_allocator.create_legion_instance( + reserveInst, allocated_peft_buffer_size, "SoftmaxMeta"); + output_grad_ptr = + gpu_mem_allocator.allocate_instance_untyped(allocated_peft_buffer_size); + } else { + allocated_peft_buffer_size = 0; + output_grad_ptr = nullptr; + } std::strcpy(op_name, softmax->name); } @@ -120,12 +133,11 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, } } -void inference_kernel_wrapper(SoftmaxMeta const *m, +void inference_kernel_wrapper(SoftmaxMeta *m, BatchConfig const *bc, bool is_last_op, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - GenericTensorAccessorW const &output_grad) { + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); hipEvent_t t_start, t_end; @@ -142,12 +154,9 @@ void inference_kernel_wrapper(SoftmaxMeta const *m, output.get_float_ptr(), num_classes, stream); - if (is_last_op) { - checkCUDA(hipMemcpyAsync(output_grad.get_float_ptr(), - output.get_float_ptr(), - output.domain.get_volume() * sizeof(float), - hipMemcpyDeviceToDevice, - stream)); + if (is_last_op && bc->num_finetuning_fwd_requests() > 0) 
{ + Internal::store_peft_activations( + m, bc, num_classes, output.get_float_ptr(), stream); } } else if (m->output_type[0] == DT_HALF) { Internal::inference_kernel(m, @@ -156,12 +165,9 @@ void inference_kernel_wrapper(SoftmaxMeta const *m, output.get_half_ptr(), num_classes, stream); - if (is_last_op) { - checkCUDA(hipMemcpyAsync(output_grad.get_half_ptr(), - output.get_half_ptr(), - output.domain.get_volume() * sizeof(half), - hipMemcpyDeviceToDevice, - stream)); + if (is_last_op && bc->num_finetuning_fwd_requests() > 0) { + Internal::store_peft_activations( + m, bc, num_classes, output.get_half_ptr(), stream); } } else { assert(false && "Unsupported data type"); @@ -183,8 +189,7 @@ void inference_kernel_wrapper(SoftmaxMeta const *m, void peft_bwd_kernel_wrapper(SoftmaxMeta const *m, BatchConfig const *bc, - GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorW const &input_grad) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); hipEvent_t t_start, t_end; @@ -194,21 +199,13 @@ void peft_bwd_kernel_wrapper(SoftmaxMeta const *m, checkCUDA(hipEventRecord(t_start, stream)); } - int num_classes = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + int num_classes = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; if (m->output_type[0] == DT_FLOAT) { - Internal::peft_bwd_kernel(m, - bc, - input_grad.get_float_ptr(), - output_grad.get_float_ptr(), - num_classes, - stream); + Internal::peft_bwd_kernel( + m, bc, input_grad.get_float_ptr(), num_classes, stream); } else if (m->output_type[0] == DT_HALF) { - Internal::peft_bwd_kernel(m, - bc, - input_grad.get_half_ptr(), - output_grad.get_half_ptr(), - num_classes, - stream); + Internal::peft_bwd_kernel( + m, bc, input_grad.get_half_ptr(), num_classes, stream); } else { assert(false && "Unsupported data type"); } @@ -288,6 +285,39 @@ void inference_kernel(SoftmaxMeta const *m, MIOPEN_SOFTMAX_MODE_CHANNEL)); } +template +void store_peft_activations(SoftmaxMeta *m, + BatchConfig const *bc, + int num_classes, + DT *output_ptr, + hipStream_t stream) { + assert(m->enable_peft_finetuning); + assert(m->output_grad_ptr != nullptr); + + int num_ft_tokens = bc->num_finetuning_fwd_tokens(); + int i = bc->finetuning_request_index(); + int tokens_previous_requests = + bc->requestsInfo[i].first_token_offset_in_batch; + int prev_steps_tokens = bc->requestsInfo[i].first_token_depth_in_request; + assert(bc->requestsInfo[i].num_tokens_in_batch == num_ft_tokens); + + // shift labels by 1 position to the left (ignore first token label) + for (int j = 0; j < num_ft_tokens - 1; j++) { + m->peft_token_ids[prev_steps_tokens + j] = + bc->tokensInfo[tokens_previous_requests + j + 1].token_id; + } + + size_t batch_offset = num_classes * tokens_previous_requests; + size_t req_offset = num_classes * prev_steps_tokens; + size_t data_size = num_classes * num_ft_tokens * sizeof(DT); + assert(m->allocated_peft_buffer_size >= data_size); + checkCUDA(hipMemcpyAsync(static_cast
<DT *>
(m->output_grad_ptr) + req_offset, + output_ptr + batch_offset, + data_size, + hipMemcpyDeviceToDevice, + stream)); +} + template __global__ void sparse_categorical_crossentropy_loss_peft_backward( DT *input_grad, @@ -309,32 +339,22 @@ template void peft_bwd_kernel(SoftmaxMeta const *m, BatchConfig const *bc, DT *input_grad_ptr, - DT const *output_grad_ptr, int num_classes, hipStream_t stream) { - BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS]; assert( bc->peft_bwd_applies_to_this_layer(m->layer_guid.transformer_layer_id)); int i = bc->finetuning_request_index(); - int tokens_previous_requests = - bc->requestsInfo[i].first_token_offset_in_batch; + int num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; - // shift labels by 1 position to the left (ignore first token label) - for (int j = 0; j < num_bwd_tokens; j++) { - token_ids[j] = bc->tokensInfo[j + tokens_previous_requests + 1].token_id; - } - DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch - 1); + DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch); // ignore last token - checkCUDA(hipMemsetAsync(input_grad_ptr + - (tokens_previous_requests + - bc->requestsInfo[i].num_tokens_in_batch - 1) * - num_classes, + checkCUDA(hipMemsetAsync(input_grad_ptr + num_bwd_tokens * num_classes, 0, num_classes * sizeof(DT), stream)); checkCUDA(hipMemcpyAsync(m->handle.workSpace, - token_ids, + m->peft_token_ids, sizeof(BatchConfig::TokenId) * num_bwd_tokens, hipMemcpyHostToDevice, stream)); @@ -344,8 +364,8 @@ void peft_bwd_kernel(SoftmaxMeta const *m, CUDA_NUM_THREADS, 0, stream, - input_grad_ptr + tokens_previous_requests * num_classes, - output_grad_ptr + tokens_previous_requests * num_classes, + input_grad_ptr, + static_cast
<DT *>
(m->output_grad_ptr), static_cast(m->handle.workSpace), num_bwd_tokens, num_classes); @@ -355,7 +375,7 @@ void peft_bwd_kernel(SoftmaxMeta const *m, CUDA_NUM_THREADS, 0, stream, - input_grad_ptr + tokens_previous_requests * num_classes, + input_grad_ptr, num_bwd_tokens * num_classes, DT(0.0), scale_factor); diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index 540e15c08..3a7864baf 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -154,7 +154,8 @@ void inference_kernel_wrapper(SoftmaxMeta *m, num_classes, stream); if (is_last_op && bc->num_finetuning_fwd_requests() > 0) { - Internal::store_peft_activations(m, bc, num_classes, output.get_float_ptr(), stream); + Internal::store_peft_activations( + m, bc, num_classes, output.get_float_ptr(), stream); } } else if (m->output_type[0] == DT_HALF) { Internal::inference_kernel(m, @@ -164,7 +165,8 @@ void inference_kernel_wrapper(SoftmaxMeta *m, num_classes, stream); if (is_last_op && bc->num_finetuning_fwd_requests() > 0) { - Internal::store_peft_activations(m, bc, num_classes, output.get_half_ptr(), stream); + Internal::store_peft_activations( + m, bc, num_classes, output.get_half_ptr(), stream); } } else { assert(false && "Unsupported data type"); @@ -285,29 +287,31 @@ void inference_kernel(SoftmaxMeta const *m, template void store_peft_activations(SoftmaxMeta *m, - BatchConfig const *bc, - int num_classes, - DT *output_ptr, - cudaStream_t stream) { + BatchConfig const *bc, + int num_classes, + DT *output_ptr, + cudaStream_t stream) { assert(m->enable_peft_finetuning); assert(m->output_grad_ptr != nullptr); - + int num_ft_tokens = bc->num_finetuning_fwd_tokens(); int i = bc->finetuning_request_index(); - int tokens_previous_requests = bc->requestsInfo[i].first_token_offset_in_batch; + int tokens_previous_requests = + bc->requestsInfo[i].first_token_offset_in_batch; int prev_steps_tokens = bc->requestsInfo[i].first_token_depth_in_request; assert(bc->requestsInfo[i].num_tokens_in_batch == num_ft_tokens); // shift labels by 1 position to the left (ignore first token label) - for (int j = 0; j < num_ft_tokens-1; j++) { - m->peft_token_ids[j] = bc->tokensInfo[tokens_previous_requests + prev_steps_tokens + j + 1].token_id; + for (int j = 0; j < num_ft_tokens - 1; j++) { + m->peft_token_ids[prev_steps_tokens + j] = + bc->tokensInfo[tokens_previous_requests + j + 1].token_id; } size_t batch_offset = num_classes * tokens_previous_requests; size_t req_offset = num_classes * prev_steps_tokens; size_t data_size = num_classes * num_ft_tokens * sizeof(DT); assert(m->allocated_peft_buffer_size >= data_size); - checkCUDA(cudaMemcpyAsync(static_cast(m->output_grad_ptr) + req_offset, + checkCUDA(cudaMemcpyAsync(static_cast
<DT *>
(m->output_grad_ptr) + req_offset, output_ptr + batch_offset, data_size, cudaMemcpyDeviceToDevice, @@ -343,9 +347,9 @@ void peft_bwd_kernel(SoftmaxMeta const *m, int num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; - DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch ); + DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch); // ignore last token - checkCUDA(cudaMemsetAsync(input_grad_ptr + num_bwd_tokens * num_classes, + checkCUDA(cudaMemsetAsync(input_grad_ptr + num_bwd_tokens * num_classes, 0, num_classes * sizeof(DT), stream)); @@ -367,10 +371,8 @@ void peft_bwd_kernel(SoftmaxMeta const *m, scale_kernel<<>>(input_grad_ptr, - num_bwd_tokens * num_classes, - DT(0.0), - scale_factor); + stream>>>( + input_grad_ptr, num_bwd_tokens * num_classes, DT(0.0), scale_factor); } } // namespace Internal diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index 55569ea06..207fd382d 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -40,10 +40,9 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, DataType data_type = ln->data_type; size_t in_dim = ln->inputs[0]->dims[0].size / ln->inputs[0]->dims[0].degree; allocated_peft_buffer_size = - enable_peft_finetuning - ? (data_type_size(data_type) * - BatchConfig::max_finetuning_sequence_length() * in_dim) - : 0; + enable_peft_finetuning ? (data_type_size(data_type) * + BatchConfig::max_sequence_length() * in_dim) + : 0; size_t totalSize = effective_batch_size * data_type_size(data_type) * 6 + allocated_peft_buffer_size; gpu_mem_allocator.create_legion_instance( @@ -252,7 +251,7 @@ void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; assert(m->allocated_peft_buffer_size == data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim); + BatchConfig::max_sequence_length() * in_dim); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; @@ -260,14 +259,14 @@ void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, if (m->input_type[0] == DT_FLOAT) { checkCUDA(hipMemcpyAsync( m->input_activation, - added_output.get_float_ptr() + first_token_offset * in_dim, + input.get_float_ptr() + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, hipMemcpyDeviceToDevice, stream)); } else if (m->input_type[0] == DT_HALF) { checkCUDA(hipMemcpyAsync( m->input_activation, - added_output.get_half_ptr() + first_token_offset * in_dim, + input.get_half_ptr() + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, hipMemcpyDeviceToDevice, stream)); diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 60c37ea7a..fecaa067c 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -39,10 +39,9 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, DataType data_type = ln->data_type; size_t in_dim = ln->inputs[0]->dims[0].size / ln->inputs[0]->dims[0].degree; allocated_peft_buffer_size = - enable_peft_finetuning - ? (data_type_size(data_type) * - BatchConfig::max_finetuning_sequence_length() * in_dim) - : 0; + enable_peft_finetuning ? 
(data_type_size(data_type) * + BatchConfig::max_sequence_length() * in_dim) + : 0; size_t totalSize = effective_batch_size * data_type_size(data_type) * 6 + allocated_peft_buffer_size; gpu_mem_allocator.create_legion_instance( @@ -251,7 +250,7 @@ void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; assert(m->allocated_peft_buffer_size == data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim); + BatchConfig::max_sequence_length() * in_dim); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 99e4f100f..7afb7cff8 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -737,8 +737,6 @@ bool Linear::peft_bwd_task(Task const *task, ctx, task->regions[0].region.get_index_space()); LinearMeta *m = *((LinearMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - // std::string op_name_without_uid = get_op_name_without_uid(m); - // std::cout << "Linear PEFT BWD " << op_name_without_uid << std::endl; if (!bc->peft_bwd_applies_to_this_layer(m->layer_guid.transformer_layer_id)) { return false; } @@ -772,13 +770,8 @@ bool Linear::peft_bwd_task(Task const *task, in_dim, num_peft_tokens); } - peft_bwd_kernel_wrapper(m, - bc, - input_grad.ptr, - output_grad.ptr, - weight.ptr, - in_dim, - out_dim); + peft_bwd_kernel_wrapper( + m, bc, input_grad.ptr, output_grad.ptr, weight.ptr, in_dim, out_dim); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/residual_layer_norm.cpp b/src/ops/residual_layer_norm.cpp index 606045daf..001b3f88b 100644 --- a/src/ops/residual_layer_norm.cpp +++ b/src/ops/residual_layer_norm.cpp @@ -42,10 +42,9 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, DataType data_type = ln->data_type; size_t in_dim = ln->inputs[0]->dims[0].size / ln->inputs[0]->dims[0].degree; allocated_peft_buffer_size = - enable_peft_finetuning - ? (data_type_size(data_type) * - BatchConfig::max_finetuning_sequence_length() * in_dim) - : 0; + enable_peft_finetuning ? (data_type_size(data_type) * + BatchConfig::max_sequence_length() * in_dim) + : 0; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3 + allocated_peft_buffer_size; gpu_mem_allocator.create_legion_instance( @@ -279,7 +278,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; assert(m->allocated_peft_buffer_size == data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim); + BatchConfig::max_sequence_length() * in_dim); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index ff61f6f79..64edcb853 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -41,10 +41,9 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, DataType data_type = ln->data_type; size_t in_dim = ln->inputs[0]->dims[0].size / ln->inputs[0]->dims[0].degree; allocated_peft_buffer_size = - enable_peft_finetuning - ? 
(data_type_size(data_type) * - BatchConfig::max_finetuning_sequence_length() * in_dim) - : 0; + enable_peft_finetuning ? (data_type_size(data_type) * + BatchConfig::max_sequence_length() * in_dim) + : 0; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3 + allocated_peft_buffer_size; gpu_mem_allocator.create_legion_instance( @@ -277,7 +276,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; assert(m->allocated_peft_buffer_size == data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim); + BatchConfig::max_sequence_length() * in_dim); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index 2e3f92a95..77c673f07 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -28,10 +28,10 @@ SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, inference_debugging = ssm->inference_debugging; enable_peft_finetuning = ssm->enable_peft_finetuning; if (enable_peft_finetuning) { - size_t in_dim = ln->inputs[0]->dims[0].size / ln->inputs[0]->dims[0].degree; - allocated_peft_buffer_size = 2 * data_type_size(data_type) * - BatchConfig::max_finetuning_sequence_length() * - in_dim; + size_t in_dim = + ssm->inputs[0]->dims[0].size / ssm->inputs[0]->dims[0].degree; + allocated_peft_buffer_size = 2 * data_type_size(input_type[0]) * + BatchConfig::max_sequence_length() * in_dim; gpu_mem_allocator.create_legion_instance( reserveInst, allocated_peft_buffer_size, "SigmoidSiluMultiMeta"); input_activation = @@ -119,13 +119,15 @@ void SigmoidSiluMulti::inference_kernel_wrapper( int i = bc->finetuning_request_index(); assert(bc->requestsInfo[i].peft_model_id != PEFTModelID::NO_ID); assert(!bc->requestsInfo[i].finetuning_backward_phase); - int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); + int max_peft_tokens = BatchConfig::max_sequence_length(); int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; size_t input_tensor_size = data_type_size(m->input_type[0]) * num_peft_tokens * in_dim; - assert(m->allocated_peft_buffer_size == 2 * input_tensor_size); + assert(m->allocated_peft_buffer_size == + 2 * (data_type_size(m->input_type[0]) * max_peft_tokens * in_dim)); // copy input activation if (m->input_type[0] == DT_FLOAT) { checkCUDA( diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index 14df260ea..22c4d79cc 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -30,8 +30,7 @@ SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, size_t in_dim = ssm->inputs[0]->dims[0].size / ssm->inputs[0]->dims[0].degree; allocated_peft_buffer_size = 2 * data_type_size(input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * - in_dim; + BatchConfig::max_sequence_length() * in_dim; gpu_mem_allocator.create_legion_instance( reserveInst, allocated_peft_buffer_size, "SigmoidSiluMultiMeta"); input_activation = @@ -122,7 +121,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; int num_peft_tokens = 
bc->requestsInfo[i].num_tokens_in_batch; assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); - int max_peft_tokens = BatchConfig::max_finetuning_sequence_length(); + int max_peft_tokens = BatchConfig::max_sequence_length(); int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; size_t input_tensor_size = data_type_size(m->input_type[0]) * num_peft_tokens * in_dim; diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index fc8a0a96b..808cbf720 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -662,4 +662,4 @@ size_t hash::operator()( hash_combine(key, params.dim); return key; } -}; // namespace std +}; // namespace std \ No newline at end of file diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index da52ccea7..851fbdcca 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -23,12 +23,11 @@ namespace FlexFlow { +#define WARP_SIZE 32 + // declare Legion names using Legion::coord_t; using Legion::Memory; - -#define WARP_SIZE 32 - using namespace Kernels::IncMultiHeadAttention; namespace Kernels { @@ -576,6 +575,7 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, compute_type, HIPBLAS_GEMM_DEFAULT)); + // add alibi position bias to qk production if (*m->position_bias) { size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd
<DT>
), @@ -710,12 +710,12 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, size_t qkv_proj_size = m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); - hipMemcpyAsync(m->devQKVProjArray, - qkv_ptr, - qkv_proj_size * - sizeof(DT), // is this right, do we need layers etc here - hipMemcpyDeviceToDevice, - stream); + checkCUDA(hipMemcpyAsync( + m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * sizeof(DT), // is this right, do we need layers etc here + hipMemcpyDeviceToDevice, + stream)); // phase 1: Implement kernel to compute KQV for input tokens // TODO WARNING: this is commented out only because we are fixing the inc_attn // first @@ -735,11 +735,11 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, int num_tokens = bc->num_active_tokens(); - hipMemcpyAsync(output_ptr, - m->attn_heads, - m->oProjSize * num_tokens * sizeof(DT), - hipMemcpyDeviceToDevice, - stream); + checkCUDA(hipMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + hipMemcpyDeviceToDevice, + stream)); } } // namespace SpecIncMultiHeadSelfAttention diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 6e24326dd..2bfa88bdc 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -622,12 +622,12 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, size_t qkv_proj_size = m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); - hipMemcpyAsync(m->devQKVProjArray, - qkv_ptr, - qkv_proj_size * - sizeof(DT), // is this right, do we need layers etc here - hipMemcpyDeviceToDevice, - stream); + checkCUDA(hipMemcpyAsync( + m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * sizeof(DT), // is this right, do we need layers etc here + hipMemcpyDeviceToDevice, + stream)); // phase 1: Implement kernel to compute KQV for input tokens // TODO WARNING: this is commented out only because we are fixing the inc_attn diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 8d3606455..e7c72d93a 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -45,7 +45,8 @@ using namespace FlexFlow::Kernels::AllReduce; /* Params */ bool operator==(AllReduceParams const &lhs, AllReduceParams const &rhs) { - return lhs.allreduce_legion_dim == rhs.allreduce_legion_dim && + return lhs.layer_guid == rhs.layer_guid && + lhs.allreduce_legion_dim == rhs.allreduce_legion_dim && std::strcmp(lhs.name, rhs.name) == 0; } @@ -55,6 +56,7 @@ bool AllReduceParams::is_valid(ParallelTensorShape const &input) const { AllReduceParams AllReduce::get_params() const { AllReduceParams params; + params.layer_guid = this->layer_guid; params.allreduce_legion_dim = this->allreduce_dim; if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); @@ -63,10 +65,11 @@ AllReduceParams AllReduce::get_params() const { } AllReduce::AllReduce(FFModel &model, + LayerID const &_layer_guid, const ParallelTensor _input, int _allreduce_legion_dim, char const *name) - : ParallelOp(model, OP_ALLREDUCE, name, _input), + : ParallelOp(model, OP_ALLREDUCE, name, _input), layer_guid(_layer_guid), allreduce_dim(_allreduce_legion_dim) { int numdim = _input->num_dims; ParallelDim dims[MAX_TENSOR_DIM]; @@ -83,7 +86,11 @@ AllReduce::AllReduce(FFModel &model, AllReduceParams const ¶ms, ParallelTensor const input, char const *name) - : AllReduce(model, input, params.allreduce_legion_dim, params.name) {} + : AllReduce(model, + 
params.layer_guid, + input, + params.allreduce_legion_dim, + params.name) {} void AllReduce::create_input_partition(FFModel &ff) { // Do nothing @@ -112,6 +119,7 @@ OpMeta *AllReduce::init_task(Task const *task, meta->output_type[0] = ar->outputs[0]->data_type; assert(meta->input_type[0] == meta->output_type[0]); std::strcpy(meta->op_name, ar->name); + meta->layer_guid = ar->layer_guid; return meta; } diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu index 3041f9adf..6201962d9 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cu +++ b/src/parallel_ops/kernels/allreduce_kernels.cu @@ -86,7 +86,7 @@ void peft_bwd_kernel_wrapper(AllReduceMeta const *m, assert(input_grad.domain == output_grad.domain); size_t hidden_dim_size = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; - size_t num_elements = bc->num_active_tokens(); + size_t num_elements = bc->num_finetuning_bwd_tokens(); size_t data_size = data_type_size(output_grad.data_type); checkCUDA(cudaMemcpyAsync(input_grad.ptr, output_grad.ptr, diff --git a/src/parallel_ops/kernels/parallel_identity_kernels.cu b/src/parallel_ops/kernels/parallel_identity_kernels.cu index 6800f3ab1..2099347fa 100644 --- a/src/parallel_ops/kernels/parallel_identity_kernels.cu +++ b/src/parallel_ops/kernels/parallel_identity_kernels.cu @@ -76,7 +76,7 @@ void peft_bwd_kernel_wrapper(ParallelIdentityMeta const *m, assert(input_grad.domain == output_grad.domain); size_t hidden_dim_size = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; - size_t num_elements = bc->num_active_tokens() * hidden_dim_size; + size_t num_elements = bc->num_finetuning_bwd_tokens() * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input_grad.data_type); checkNCCL(ncclAllReduce(output_grad.ptr, diff --git a/src/parallel_ops/parallel_identity.cc b/src/parallel_ops/parallel_identity.cc index db0265726..723856c81 100644 --- a/src/parallel_ops/parallel_identity.cc +++ b/src/parallel_ops/parallel_identity.cc @@ -46,7 +46,8 @@ using namespace FlexFlow::Kernels::ParallelIdentity; /* Params */ bool operator==(ParallelIdentityParams const &lhs, ParallelIdentityParams const &rhs) { - return lhs.parallel_identity_legion_dim == rhs.parallel_identity_legion_dim && + return lhs.layer_guid == rhs.layer_guid && + lhs.parallel_identity_legion_dim == rhs.parallel_identity_legion_dim && std::strcmp(lhs.name, rhs.name) == 0; } @@ -56,6 +57,7 @@ bool ParallelIdentityParams::is_valid(ParallelTensorShape const &input) const { ParallelIdentityParams ParallelIdentity::get_params() const { ParallelIdentityParams params; + params.layer_guid = this->layer_guid; params.parallel_identity_legion_dim = this->parallel_identity_dim; if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); @@ -64,10 +66,12 @@ ParallelIdentityParams ParallelIdentity::get_params() const { } ParallelIdentity::ParallelIdentity(FFModel &model, + LayerID const &_layer_guid, const ParallelTensor _input, int _parallel_identity_legion_dim, char const *name) : ParallelOp(model, OP_PARALLEL_IDENTITY, name, _input), + layer_guid(_layer_guid), parallel_identity_dim(_parallel_identity_legion_dim) { int numdim = _input->num_dims; ParallelDim dims[MAX_TENSOR_DIM]; @@ -84,8 +88,11 @@ ParallelIdentity::ParallelIdentity(FFModel &model, ParallelIdentityParams const ¶ms, ParallelTensor const input, char const *name) - : ParallelIdentity( - model, input, params.parallel_identity_legion_dim, params.name) {} + : 
ParallelIdentity(model, + params.layer_guid, + input, + params.parallel_identity_legion_dim, + params.name) {} void ParallelIdentity::create_input_partition(FFModel &ff) { // Do nothing @@ -114,6 +121,7 @@ OpMeta *ParallelIdentity::init_task(Task const *task, meta->output_type[0] = ar->outputs[0]->data_type; assert(meta->input_type[0] == meta->output_type[0]); std::strcpy(meta->op_name, ar->name); + meta->layer_guid = ar->layer_guid; return meta; } diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index f8c4982c9..20fed4955 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -304,4 +304,4 @@ void BatchConfig::save_to_file(std::string const &filename) const { } } -}; // namespace FlexFlow +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 2bc64c167..1f086cc1a 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2453,6 +2453,9 @@ GraphOptimalViewSerialized } case OP_ALLREDUCE: { AllReduce *allreduce = (AllReduce *)op; + sez.serialize(allreduce->layer_guid.id); + sez.serialize(allreduce->layer_guid.transformer_layer_id); + sez.serialize(allreduce->layer_guid.model_id); sez.serialize(allreduce->allreduce_dim); sez.serialize(strlen(allreduce->name)); sez.serialize(allreduce->name, strlen(allreduce->name)); @@ -2460,6 +2463,9 @@ GraphOptimalViewSerialized } case OP_PARALLEL_IDENTITY: { ParallelIdentity *parallel_identity = (ParallelIdentity *)op; + sez.serialize(parallel_identity->layer_guid.id); + sez.serialize(parallel_identity->layer_guid.transformer_layer_id); + sez.serialize(parallel_identity->layer_guid.model_id); sez.serialize(parallel_identity->parallel_identity_dim); sez.serialize(strlen(parallel_identity->name)); sez.serialize(parallel_identity->name, strlen(parallel_identity->name)); @@ -3154,6 +3160,11 @@ void FFModel::deserialize_graph_optimal_view( break; } case OP_ALLREDUCE: { + size_t id, transformer_layer_id, deserialized_model_id; + dez.deserialize(id); + dez.deserialize(transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); assert(num_inputs == 1); int allreduce_dim; dez.deserialize(allreduce_dim); @@ -3162,12 +3173,18 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(name_len); dez.deserialize(name, name_len); AllReduceParams params; + params.layer_guid = layer_guid; params.allreduce_legion_dim = allreduce_dim; strcpy(params.name, name); node = get_or_create_node(inputs[0], params); break; } case OP_PARALLEL_IDENTITY: { + size_t id, transformer_layer_id, deserialized_model_id; + dez.deserialize(id); + dez.deserialize(transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); assert(num_inputs == 1); int parallel_identity_dim; dez.deserialize(parallel_identity_dim); @@ -3176,6 +3193,7 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(name_len); dez.deserialize(name, name_len); ParallelIdentityParams params; + params.layer_guid = layer_guid; params.parallel_identity_legion_dim = parallel_identity_dim; strcpy(params.name, name); node = get_or_create_node(inputs[0], params); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 5b1f2d19a..c3256374f 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3513,7 +3513,11 @@ void FFModel::create_operators_from_layers() { std::to_string( transformer_layer_allreduce_count[transformer_layer_id])); 
transformer_layer_allreduce_count[transformer_layer_id]++; + LayerID ar_guid = LayerID(this->layer_global_guid++, + op->layer_guid.transformer_layer_id, + this->model_id); AllReduce *allreduce = new AllReduce(*this, + ar_guid, op->outputs[0], op->outputs[0]->num_dims - 1, allreduce_name.c_str()); @@ -3542,15 +3546,20 @@ void FFModel::create_operators_from_layers() { transformer_layer_parallel_identity_count[transformer_layer_id])); transformer_layer_parallel_identity_count[transformer_layer_id]++; ParallelIdentity *parallel_identity = nullptr; + LayerID pi_guid = LayerID(this->layer_global_guid++, + op->layer_guid.transformer_layer_id, + this->model_id); if (op->numOutputs == 1) { parallel_identity = new ParallelIdentity(*this, + pi_guid, op->outputs[0], op->outputs[0]->num_dims - 1, parallel_identity_name.c_str()); } else if (op->numOutputs == 2) { parallel_identity = new ParallelIdentity(*this, + pi_guid, op->outputs[1], op->outputs[1]->num_dims - 1, parallel_identity_name.c_str()); @@ -4685,25 +4694,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } - // RequestManager process_work_from_old_batches_task - // { - // TaskVariantRegistrar registrar( - // RM_PROCESS_WORK_FROM_OLD_BATCHES_TASK_ID, - // "RequestManager Process Work from Old Batches"); - // registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - // registrar.set_leaf(); - // if (pre_register) { - // Runtime::preregister_task_variant( - // registrar, "RequestManager Process Work from Old Batches Task"); - // } else { - // if (enable_control_replication) { - // registrar.global_registration = false; - // } - // runtime->register_task_variant(registrar); - // } - // } // RequestManager prepare_next_batch_task { TaskVariantRegistrar registrar(RM_PREPARE_NEXT_BATCH_TASK_ID, @@ -7839,4 +7829,4 @@ LEGION_FOREACH_NN(DIMFUNC) LEGION_FOREACH_NN(DIMFUNC) #undef DIMFUNC -}; // namespace FlexFlow +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc index 300a2385c..efb72f331 100644 --- a/src/runtime/peft_weight_allocator.cc +++ b/src/runtime/peft_weight_allocator.cc @@ -338,4 +338,4 @@ void PEFTMemoryManager::check_ft_model_id(PEFTModelID const &model_id) { assert(finetuning_model_id == model_id && "PEFT bwd model is not in memory!"); } -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 1990f1d11..fc2b45da7 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -284,7 +284,6 @@ RequestManager::RequestManager() max_tokens_per_batch = -1; max_spec_tree_token_num = -1; max_sequence_length = -1; - max_finetuning_sequence_length = -1; step_idx = 0; run_idx = 0; } @@ -1251,7 +1250,8 @@ void RequestManager::add_finetuning_req_fwd_batch(BatchConfig &new_bc) { new_bc.num_active_tokens(); new_bc.requestsInfo[inference_batch_size].num_tokens_in_batch = num_peft_tokens; - new_bc.requestsInfo[inference_batch_size].max_length = request.dataset[dataset_entry].size(); + new_bc.requestsInfo[inference_batch_size].max_length = + request.dataset[dataset_entry].size(); new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; new_bc.requestsInfo[inference_batch_size].peft_model_id = request.peft_model_id; @@ -1312,7 +1312,8 @@ void RequestManager::add_finetuning_req_bwd_batch(BatchConfig &new_bc) { new_bc.num_active_tokens(); 
new_bc.requestsInfo[inference_batch_size].num_tokens_in_batch = request.dataset[dataset_entry].size(); - new_bc.requestsInfo[inference_batch_size].max_length = request.dataset[dataset_entry].size(); + new_bc.requestsInfo[inference_batch_size].max_length = + request.dataset[dataset_entry].size(); new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; new_bc.requestsInfo[inference_batch_size].peft_model_id = request.peft_model_id; diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index 0e28c02cd..96793abeb 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -3800,14 +3800,18 @@ bool FFModel::convert_graph_to_operators( case OP_ALLREDUCE: { assert(inList.size() == 1); AllReduce *allreduce = (AllReduce *)node.ptr; - new_op = new AllReduce( - *this, inputs[0], allreduce->allreduce_dim, allreduce->name); + new_op = new AllReduce(*this, + allreduce->layer_guid, + inputs[0], + allreduce->allreduce_dim, + allreduce->name); break; } case OP_PARALLEL_IDENTITY: { assert(inList.size() == 1); ParallelIdentity *parallel_identity = (ParallelIdentity *)node.ptr; new_op = new ParallelIdentity(*this, + parallel_identity->layer_guid, inputs[0], parallel_identity->parallel_identity_dim, parallel_identity->name); diff --git a/tests/peft/hf_utils.py b/tests/peft/hf_utils.py index 320966bf1..3a10fc443 100644 --- a/tests/peft/hf_utils.py +++ b/tests/peft/hf_utils.py @@ -3,10 +3,8 @@ import transformers from transformers import ( TrainerCallback, - AutoConfig, AutoModelForCausalLM, AutoTokenizer, - LlamaTokenizer, ) import os, shutil from peft import PeftConfig, PeftModel @@ -306,21 +304,10 @@ def build_peft_model(args, peft_config): def get_peft_tokenizer(args, peft_config): # Get Tokenizer - hf_config = AutoConfig.from_pretrained( - peft_config.base_model_name_or_path, trust_remote_code=True + tokenizer = AutoTokenizer.from_pretrained( + peft_config.base_model_name_or_path, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, ) - hf_arch = getattr(hf_config, "architectures")[0] - if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": - tokenizer = LlamaTokenizer.from_pretrained( - peft_config.base_model_name_or_path, - use_fast=True, - torch_dtype=torch.float32 if args.use_full_precision else torch.float16, - ) - else: - tokenizer = AutoTokenizer.from_pretrained( - peft_config.base_model_name_or_path, - torch_dtype=torch.float32 if args.use_full_precision else torch.float16, - ) if tokenizer.pad_token is None: tokenizer.pad_token = "[PAD]" tokenizer.padding_side = "left" @@ -379,4 +366,4 @@ def save_finetuned_model(model, args): print( f"Uploading the model to HF hub with id: {args.publish_peft_with_id}..." ) - model.push_to_hub(args.publish_peft_with_id, use_auth_token=True) + model.push_to_hub(args.publish_peft_with_id, use_auth_token=True) \ No newline at end of file
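Note on the recurring pattern in the hunks above: several operators gain a store_peft_activations helper that copies the finetuning request's slice of a batched activation tensor into a persistent per-operator buffer (allocated up front as data_type_size(data_type) * BatchConfig::max_sequence_length() * in_dim), so the later peft_bwd kernels can read it back. The sketch below is a minimal, self-contained illustration of that offset arithmetic only, not the FlexFlow implementation; the function and parameter names (stash_finetuning_activations, peft_buffer, batch_output, and so on) are hypothetical simplifications.

// Minimal sketch (hypothetical names, reduced error handling) of the
// activation-stashing pattern used by the store_peft_activations helpers.
#include <cassert>
#include <cstddef>
#include <cuda_runtime.h>

template <typename DT>
void stash_finetuning_activations(
    DT *peft_buffer,           // persistent buffer, capacity in_dim * max_seq_len
    size_t peft_buffer_elems,  // capacity of peft_buffer, in DT elements
    DT const *batch_output,    // operator output for the whole mixed batch
    int in_dim,                // hidden size per token
    int first_token_offset_in_batch, // where the finetuning request starts in this batch
    int tokens_stashed_so_far,       // tokens saved by earlier forward steps
    int num_ft_tokens,               // finetuning tokens in this batch
    cudaStream_t stream) {
  // Source offset: skip the inference requests that precede the finetuning
  // request inside the batched output.
  size_t batch_offset =
      static_cast<size_t>(in_dim) * first_token_offset_in_batch;
  // Destination offset: append after the part of the sequence already stashed.
  size_t request_offset = static_cast<size_t>(in_dim) * tokens_stashed_so_far;
  size_t num_elems = static_cast<size_t>(in_dim) * num_ft_tokens;
  assert(request_offset + num_elems <= peft_buffer_elems);
  // Device-to-device copy on the operator's stream; the buffer is consumed
  // later by the corresponding peft_bwd kernel.
  cudaMemcpyAsync(peft_buffer + request_offset,
                  batch_output + batch_offset,
                  num_elems * sizeof(DT),
                  cudaMemcpyDeviceToDevice,
                  stream);
}

The assert models the same capacity check the diff performs against allocated_peft_buffer_size before each copy.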