diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e23d9e5fb..97dfd7813 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -12,7 +12,7 @@ concurrency: jobs: cmake-build: name: Build FlexFlow Serve - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 defaults: run: shell: bash -l {0} # required to use an activated conda environment @@ -68,14 +68,14 @@ jobs: ../config/config.linux make -j $n_build_cores make install - # sudo make install - # sudo ldconfig - name: Check availability of flexflow modules in Python run: | if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then export LD_LIBRARY_PATH="$CUDA_PATH/lib64/stubs:$LD_LIBRARY_PATH" sudo ln -s $CUDA_PATH/lib64/stubs/libcuda.so $CUDA_PATH/lib64/stubs/libcuda.so.1 + else + sudo ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 $CONDA_PREFIX/lib/libstdc++.so.6 fi # Remove build folder to check that the installed version can run independently of the build files rm -rf build diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml index 3af6ba664..3b2343079 100644 --- a/.github/workflows/clang-format-check.yml +++ b/.github/workflows/clang-format-check.yml @@ -3,7 +3,7 @@ on: [push, pull_request, workflow_dispatch] jobs: formatting-check: name: Formatting Check - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 strategy: matrix: path: diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 757689d22..ab50784b3 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -14,7 +14,7 @@ concurrency: jobs: docker-build-rocm: name: Build and Install FlexFlow in a Docker Container (ROCm backend) - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 if: ${{ ( github.event_name != 'push' && github.event_name != 'schedule' && github.event_name != 'workflow_dispatch' ) || github.ref_name != 'inference' }} env: FF_GPU_BACKEND: "hip_rocm" @@ -69,7 +69,7 @@ jobs: docker-build-cuda: name: Build and Install FlexFlow in a Docker Container (CUDA backend) - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 strategy: matrix: cuda_version: ["11.8", "12.0", "12.1", "12.2"] @@ -119,7 +119,7 @@ jobs: notify-slack: name: Notify Slack in case of failure - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: [docker-build-cuda, docker-build-and-publish-rocm] if: ${{ failure() && github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} steps: diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 748ff5094..d57ff8334 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -129,6 +129,7 @@ jobs: CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }} run: | source ./build/set_python_envs.sh + ./tests/fine_grained_alignment_test.sh ./tests/inference_tests.sh - name: Run PEFT tests diff --git a/.github/workflows/helpers/install_cudnn.sh b/.github/workflows/helpers/install_cudnn.sh index 73b8e8841..1e8038573 100755 --- a/.github/workflows/helpers/install_cudnn.sh +++ b/.github/workflows/helpers/install_cudnn.sh @@ -8,72 +8,11 @@ cd "${BASH_SOURCE[0]%/*}" ubuntu_version=$(lsb_release -rs) ubuntu_version=${ubuntu_version//./} -# Install CUDNN -cuda_version=${1:-12.1.1} -cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.') -echo "Installing CUDNN for CUDA version: ${cuda_version} ..." 
-CUDNN_LINK=http://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.1-linux-x64-v8.0.5.39.tgz -CUDNN_TARBALL_NAME=cudnn-11.1-linux-x64-v8.0.5.39.tgz -if [[ "$cuda_version" == "10.1" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-10.1-linux-x64-v8.0.5.39.tgz - CUDNN_TARBALL_NAME=cudnn-10.1-linux-x64-v8.0.5.39.tgz -elif [[ "$cuda_version" == "10.2" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-10.2-linux-x64-v8.0.5.39.tgz - CUDNN_TARBALL_NAME=cudnn-10.2-linux-x64-v8.0.5.39.tgz -elif [[ "$cuda_version" == "11.0" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.0-linux-x64-v8.0.5.39.tgz - CUDNN_TARBALL_NAME=cudnn-11.0-linux-x64-v8.0.5.39.tgz -elif [[ "$cuda_version" == "11.1" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.1-linux-x64-v8.0.5.39.tgz - CUDNN_TARBALL_NAME=cudnn-11.1-linux-x64-v8.0.5.39.tgz -elif [[ "$cuda_version" == "11.2" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.1.1/cudnn-11.2-linux-x64-v8.1.1.33.tgz - CUDNN_TARBALL_NAME=cudnn-11.2-linux-x64-v8.1.1.33.tgz -elif [[ "$cuda_version" == "11.3" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.2.1/cudnn-11.3-linux-x64-v8.2.1.32.tgz - CUDNN_TARBALL_NAME=cudnn-11.3-linux-x64-v8.2.1.32.tgz -elif [[ "$cuda_version" == "11.4" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.2.4/cudnn-11.4-linux-x64-v8.2.4.15.tgz - CUDNN_TARBALL_NAME=cudnn-11.4-linux-x64-v8.2.4.15.tgz -elif [[ "$cuda_version" == "11.5" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.0/cudnn-11.5-linux-x64-v8.3.0.98.tgz - CUDNN_TARBALL_NAME=cudnn-11.5-linux-x64-v8.3.0.98.tgz -elif [[ "$cuda_version" == "11.6" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.4.0/local_installers/11.6/cudnn-linux-x86_64-8.4.0.27_cuda11.6-archive.tar.xz - CUDNN_TARBALL_NAME=cudnn-linux-x86_64-8.4.0.27_cuda11.6-archive.tar.xz -elif [[ "$cuda_version" == "11.7" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.5.0/local_installers/11.7/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz - CUDNN_TARBALL_NAME=cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz -elif [[ "$cuda_version" == "11.8" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz - CUDNN_TARBALL_NAME=cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz -elif [[ "$cuda_version" == "12.0" || "$cuda_version" == "12.1" || "$cuda_version" == "12.2" || "$cuda_version" == "12.3" || "$cuda_version" == "12.4" || "$cuda_version" == "12.5" ]]; then - CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.8.0/local_installers/12.0/cudnn-local-repo-ubuntu2004-8.8.0.121_1.0-1_amd64.deb - CUDNN_TARBALL_NAME=cudnn-local-repo-ubuntu2004-8.8.0.121_1.0-1_amd64.deb -else - echo "CUDNN support for CUDA version above 12.5 not yet added" - exit 1 -fi -wget -c -q $CUDNN_LINK -if [[ "$cuda_version" == "11.6" || "$cuda_version" == "11.7" || "$cuda_version" == "11.8" ]]; then - tar -xf $CUDNN_TARBALL_NAME -C ./ - CUDNN_EXTRACTED_TARBALL_NAME="${CUDNN_TARBALL_NAME::-7}" - sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/include/* /usr/local/include - sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/lib/* /usr/local/lib - rm -rf 
"$CUDNN_EXTRACTED_TARBALL_NAME" -elif [[ "$CUDNN_TARBALL_NAME" == *.deb ]]; then - wget -c -q "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb" - sudo dpkg -i cuda-keyring_1.1-1_all.deb - sudo apt update -y - rm -f cuda-keyring_1.1-1_all.deb - sudo dpkg -i $CUDNN_TARBALL_NAME - sudo cp /var/cudnn-local-repo-ubuntu2004-8.8.0.121/cudnn-local-A9E17745-keyring.gpg /usr/share/keyrings/ - sudo apt update -y - sudo apt install -y libcudnn8 - sudo apt install -y libcudnn8-dev - sudo apt install -y libcudnn8-samples -else - sudo tar -xzf $CUDNN_TARBALL_NAME -C /usr/local -fi -rm $CUDNN_TARBALL_NAME +wget -c -q "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb" +sudo dpkg -i cuda-keyring_1.1-1_all.deb +sudo apt update -y +rm -f cuda-keyring_1.1-1_all.deb +sudo apt-get -y install libcudnn9-cuda-12 +sudo apt-get -y install libcudnn9-dev-cuda-12 +sudo apt-get -y install libcudnn9-samples sudo ldconfig diff --git a/.github/workflows/helpers/install_dependencies.sh b/.github/workflows/helpers/install_dependencies.sh index 9d07ea71f..ac52a80e1 100755 --- a/.github/workflows/helpers/install_dependencies.sh +++ b/.github/workflows/helpers/install_dependencies.sh @@ -40,7 +40,20 @@ if [[ "$FF_GPU_BACKEND" == "hip_cuda" || "$FF_GPU_BACKEND" = "hip_rocm" ]]; then elif [ "$hip_version" = "5.5" ]; then AMD_GPU_SCRIPT_NAME=amdgpu-install_5.5.50500-1_all.deb fi - AMD_GPU_SCRIPT_URL="https://repo.radeon.com/amdgpu-install/${hip_version}/ubuntu/focal/${AMD_GPU_SCRIPT_NAME}" + # Detect Ubuntu version + UBUNTU_VERSION=$(lsb_release -rs) + if [[ "$UBUNTU_VERSION" == "20.04" ]]; then + UBUNTU_CODENAME="focal" + elif [[ "$UBUNTU_VERSION" == "22.04" ]]; then + UBUNTU_CODENAME="jammy" + elif [[ "$UBUNTU_VERSION" == "24.04" ]]; then + UBUNTU_CODENAME="jammy" + else + echo "Unsupported Ubuntu version: $UBUNTU_VERSION" + exit 1 + fi + + AMD_GPU_SCRIPT_URL="https://repo.radeon.com/amdgpu-install/${hip_version}/ubuntu/${UBUNTU_CODENAME}/${AMD_GPU_SCRIPT_NAME}" # Download and install AMD GPU software with ROCM and HIP support wget "$AMD_GPU_SCRIPT_URL" sudo apt-get install -y ./${AMD_GPU_SCRIPT_NAME} @@ -48,20 +61,20 @@ if [[ "$FF_GPU_BACKEND" == "hip_cuda" || "$FF_GPU_BACKEND" = "hip_rocm" ]]; then sudo amdgpu-install -y --usecase=hip,rocm --no-dkms sudo apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk rocm-device-libs - # Install protobuf v3.20.x manually - sudo apt-get update -y && sudo apt-get install -y pkg-config zip g++ zlib1g-dev unzip python autoconf automake libtool curl make - git clone -b 3.20.x https://github.com/protocolbuffers/protobuf.git - cd protobuf/ - git submodule update --init --recursive - ./autogen.sh - ./configure - cores_available=$(nproc --all) - n_build_cores=$(( cores_available -1 )) - if (( n_build_cores < 1 )) ; then n_build_cores=1 ; fi - make -j $n_build_cores - sudo make install - sudo ldconfig - cd .. + # # Install protobuf v3.20.x manually + # sudo apt-get update -y && sudo apt-get install -y pkg-config zip g++ zlib1g-dev unzip python autoconf automake libtool curl make + # git clone -b 3.20.x https://github.com/protocolbuffers/protobuf.git + # cd protobuf/ + # git submodule update --init --recursive + # ./autogen.sh + # ./configure + # cores_available=$(nproc --all) + # n_build_cores=$(( cores_available -1 )) + # if (( n_build_cores < 1 )) ; then n_build_cores=1 ; fi + # make -j $n_build_cores + # sudo make install + # sudo ldconfig + # cd .. 
else echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping installing HIP dependencies" fi diff --git a/.github/workflows/helpers/install_nccl.sh b/.github/workflows/helpers/install_nccl.sh index ae6793ea2..306e5d699 100755 --- a/.github/workflows/helpers/install_nccl.sh +++ b/.github/workflows/helpers/install_nccl.sh @@ -5,47 +5,10 @@ set -x # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" -# Add NCCL key ring ubuntu_version=$(lsb_release -rs) ubuntu_version=${ubuntu_version//./} -wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb" +wget -c -q "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb" sudo dpkg -i cuda-keyring_1.1-1_all.deb -sudo apt update -y +sudo apt-get update -y --allow-change-held-packages rm -f cuda-keyring_1.1-1_all.deb - -# Install NCCL -cuda_version=${1:-12.1.1} -cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.') -echo "Installing NCCL for CUDA version: ${cuda_version} ..." - -# We need to run a different install command based on the CUDA version, otherwise running `sudo apt install libnccl2 libnccl-dev` -# will automatically upgrade CUDA to the latest version. - -if [[ "$cuda_version" == "11.0" ]]; then - sudo apt install libnccl2=2.15.5-1+cuda11.0 libnccl-dev=2.15.5-1+cuda11.0 -elif [[ "$cuda_version" == "11.1" ]]; then - sudo apt install libnccl2=2.8.4-1+cuda11.1 libnccl-dev=2.8.4-1+cuda11.1 -elif [[ "$cuda_version" == "11.2" ]]; then - sudo apt install libnccl2=2.8.4-1+cuda11.2 libnccl-dev=2.8.4-1+cuda11.2 -elif [[ "$cuda_version" == "11.3" ]]; then - sudo apt install libnccl2=2.9.9-1+cuda11.3 libnccl-dev=2.9.9-1+cuda11.3 -elif [[ "$cuda_version" == "11.4" ]]; then - sudo apt install libnccl2=2.11.4-1+cuda11.4 libnccl-dev=2.11.4-1+cuda11.4 -elif [[ "$cuda_version" == "11.5" ]]; then - sudo apt install libnccl2=2.11.4-1+cuda11.5 libnccl-dev=2.11.4-1+cuda11.5 -elif [[ "$cuda_version" == "11.6" ]]; then - sudo apt install libnccl2=2.12.12-1+cuda11.6 libnccl-dev=2.12.12-1+cuda11.6 -elif [[ "$cuda_version" == "11.7" ]]; then - sudo apt install libnccl2=2.14.3-1+cuda11.7 libnccl-dev=2.14.3-1+cuda11.7 -elif [[ "$cuda_version" == "11.8" ]]; then - sudo apt install libnccl2=2.16.5-1+cuda11.8 libnccl-dev=2.16.5-1+cuda11.8 -elif [[ "$cuda_version" == "12.0" ]]; then - sudo apt install libnccl2=2.18.3-1+cuda12.0 libnccl-dev=2.18.3-1+cuda12.0 -elif [[ "$cuda_version" == "12.1" ]]; then - sudo apt install libnccl2=2.18.3-1+cuda12.1 libnccl-dev=2.18.3-1+cuda12.1 -elif [[ "$cuda_version" == "12.2" ]]; then - sudo apt install libnccl2=2.18.3-1+cuda12.2 libnccl-dev=2.18.3-1+cuda12.2 -else - echo "Installing NCCL for CUDA version ${cuda_version} is not supported" - exit 1 -fi +sudo apt install -y --allow-change-held-packages libnccl2 libnccl-dev diff --git a/.github/workflows/pip-deploy.yml b/.github/workflows/pip-deploy.yml index 5558e51e3..37ea63705 100644 --- a/.github/workflows/pip-deploy.yml +++ b/.github/workflows/pip-deploy.yml @@ -9,7 +9,7 @@ concurrency: jobs: build-n-publish: name: Build and publish Python 🐍 distributions 📦 to PyPI and TestPyPI - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 permissions: # IMPORTANT: this permission is mandatory for trusted publishing id-token: write diff --git a/.github/workflows/pip-install.yml b/.github/workflows/pip-install.yml index f348bfe5e..26a74f8db 100644 --- a/.github/workflows/pip-install.yml +++ b/.github/workflows/pip-install.yml @@ -13,7 +13,7 @@ concurrency: jobs: 
pip-install-flexflow: name: Install FlexFlow with pip - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 defaults: run: shell: bash -l {0} # required to use an activated conda environment diff --git a/.github/workflows/shell-check.yml b/.github/workflows/shell-check.yml index a825d63d9..f2e31429c 100644 --- a/.github/workflows/shell-check.yml +++ b/.github/workflows/shell-check.yml @@ -3,7 +3,7 @@ on: [push, pull_request, workflow_dispatch] jobs: shellcheck: name: Shellcheck - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 - name: Run ShellCheck diff --git a/.gitmodules b/.gitmodules index 913b08886..1f49e93f0 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "deps/legion"] path = deps/legion - url = https://gitlab.com/goliaro/legion.git + url = https://github.com/flexflow/legion.git [submodule "deps/nccl"] path = deps/nccl url = https://github.com/NVIDIA/nccl.git diff --git a/conda/flexflow.yml b/conda/flexflow.yml index d457684d0..7568ed648 100644 --- a/conda/flexflow.yml +++ b/conda/flexflow.yml @@ -3,23 +3,19 @@ channels: - defaults - conda-forge dependencies: - - python>=3.6,<3.12 - - cffi>=1.11.0 - - Pillow + - python + - cffi - rust - cmake-build-extension - jq - pytest - pip - pip: - - qualname>=0.1.0 - - keras_preprocessing>=1.1.2 - - numpy>=1.16.0 - - torch>=1.13.1 - - torchaudio>=0.13.1 - - torchvision>=0.14.1 + - numpy + - torch + - torchaudio + - torchvision - regex - - onnx - transformers>=4.47.1 - sentencepiece - einops diff --git a/docker/build.sh b/docker/build.sh index 82e6551e6..8d477a6c5 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -4,13 +4,134 @@ set -euo pipefail # Usage: ./build.sh # Optional environment variables: FF_GPU_BACKEND, cuda_version, hip_version +get_cuda_docker_image() { + local docker_user="nvidia" + local docker_image="cuda" + local page=1 + local per_page=100 + + # Determine Ubuntu version: use lsb_release if available, else default to 22.04. + local ubuntu_version + if command -v lsb_release >/dev/null 2>&1; then + ubuntu_version=$(lsb_release -rs) + else + ubuntu_version="22.04" + fi + + # Determine CUDA version. + # If the environment variable 'cuda_version' is set (in "." format), use that. + # Otherwise, use nvidia-smi to extract the CUDA version. + local cuda_full_version + local installed_major_minor + if [[ -n "${cuda_version:-}" ]]; then + cuda_full_version="$cuda_version" + installed_major_minor="$cuda_version" + else + if ! command -v nvidia-smi >/dev/null 2>&1; then + echo "Error: nvidia-smi not found and cuda_version is not set." >&2 + return 1 + fi + local nvidia_smi_output + nvidia_smi_output=$(nvidia-smi) + local cuda_version_line + cuda_version_line=$(echo "$nvidia_smi_output" | grep "CUDA Version") + cuda_full_version=$(echo "$cuda_version_line" | sed -n 's/.*CUDA Version:\s*\([0-9]\+\.[0-9]\+\).*/\1/p') + if [[ -z "$cuda_full_version" ]]; then + echo "Error: Unable to determine CUDA version from nvidia-smi." >&2 + return 1 + fi + installed_major_minor="$cuda_full_version" + fi + + # Query Docker Hub for matching tags. 
+ local -a tags_list=() + while true; do + local response new_tags + response=$(curl -s "https://hub.docker.com/v2/repositories/${docker_user}/${docker_image}/tags?page=${page}&page_size=${per_page}") + new_tags=$(echo "$response" | jq -r --arg v "$ubuntu_version" '.results[].name | select(contains("cudnn") and contains("devel-ubuntu") and test("ubuntu"+$v+"$"))') + if [[ -z "$new_tags" ]]; then + break + fi + while read -r tag; do + tags_list+=("$tag") + done <<< "$new_tags" + ((page++)) + done + + if [ ${#tags_list[@]} -eq 0 ]; then + echo "Error: No docker images found matching criteria." >&2 + return 1 + fi + + # Sort the tags in descending order based on the CUDA version. + local sorted_tags + sorted_tags=$(printf "%s\n" "${tags_list[@]}" | sort -rV -t '-' -k1,1) + + # Find the most appropriate tag. + local selected_tag="" + while read -r tag; do + local version tag_major_minor + version=$(echo "$tag" | cut -d '-' -f1) + tag_major_minor=$(echo "$version" | awk -F. '{print $1"."$2}') + if [[ "$tag_major_minor" == "$installed_major_minor" ]]; then + selected_tag="$tag" + break + fi + done <<< "$sorted_tags" + + # If no exact match, choose the highest version lower than the installed version. + if [[ -z "$selected_tag" ]]; then + while read -r tag; do + local version + version=$(echo "$tag" | cut -d '-' -f1) + if [[ $(printf '%s\n' "$version" "$cuda_full_version" | sort -V | head -n1) == "$version" && "$version" != "$cuda_full_version" ]]; then + selected_tag="$tag" + break + fi + done <<< "$sorted_tags" + fi + + if [[ -n "$selected_tag" ]]; then + echo "${docker_user}/${docker_image}:${selected_tag}" + return 0 + else + echo "Error: No suitable docker image found." >&2 + return 1 + fi +} + +set_cuda_version_version() { + # If the user provided a cuda_version, use that. + if [[ -n "${cuda_version:-}" ]]; then + return 0 + fi + + # Otherwise, check that nvidia-smi is available. + if ! command -v nvidia-smi >/dev/null 2>&1; then + echo "Error: nvidia-smi not found and cuda_version is not set." >&2 + return 1 + fi + + # Extract the CUDA version from nvidia-smi output. + local nvidia_output cuda_ver + nvidia_output=$(nvidia-smi) + cuda_ver=$(echo "$nvidia_output" | grep "CUDA Version" | sed -n 's/.*CUDA Version:\s*\([0-9]\+\.[0-9]\+\).*/\1/p') + + if [[ -z "$cuda_ver" ]]; then + echo "Error: Unable to detect CUDA version from nvidia-smi." >&2 + return 1 + fi + + export cuda_version="$cuda_ver" + return 0 +} + # Cd into flexflow-serve. Assumes this script is in flexflow-serve/docker cd "${BASH_SOURCE[0]%/*}/.." # Parse input params image=${1:-flexflow} FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} -cuda_version=${cuda_version:-"empty"} hip_version=${hip_version:-"empty"} python_version=${python_version:-latest} @@ -36,36 +157,10 @@ ff_environment_base_image="ubuntu:20.04" gpu_backend_version="" if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then - # Autodetect cuda version if not specified - if [[ $cuda_version == "empty" ]]; then - # shellcheck disable=SC2015 - cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}' || true) - # Change cuda_version eg. V11.7.99 to 11.7 - cuda_version=${cuda_version:1:4} - if [[ -z "$cuda_version" ]]; then - echo "Could not detect CUDA version. Please specify one manually by setting the 'cuda_version' env." 
- exit 1 - fi - fi - # Check that CUDA version is supported, and modify cuda version to include default subsubversion - if [[ "$cuda_version" == @(11.1|11.3|11.7|12.0|12.1) ]]; then - cuda_version_input=${cuda_version}.1 - elif [[ "$cuda_version" == @(11.2|11.5|11.6|12.2) ]]; then - cuda_version_input=${cuda_version}.2 - elif [[ "$cuda_version" == @(11.4) ]]; then - cuda_version_input=${cuda_version}.3 - elif [[ "$cuda_version" == @(11.8) ]]; then - cuda_version_input=${cuda_version}.0 - elif [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then - # Use CUDA 12.2 for all versions greater or equal to 12.2 for now (the Docker machine with CUDNN is not yet available) - cuda_version=12.2 - cuda_version_input=${cuda_version}.2 - else - echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" - exit 1 - fi - echo "Building $image docker image with CUDA $cuda_version" - ff_environment_base_image="nvidia/cuda:${cuda_version_input}-cudnn8-devel-ubuntu20.04" + ff_environment_base_image=$(get_cuda_docker_image) || { echo "Failed to get docker image." >&2; exit 1; } + echo "Using base docker image: $ff_environment_base_image" + set_cuda_version_version || { echo "Failed to set gpu_backend_version." >&2; exit 1; } + echo "GPU Backend Version is set to: $cuda_version" gpu_backend_version="-${cuda_version}" fi @@ -90,6 +185,7 @@ if [[ "${FF_GPU_BACKEND}" == "hip_rocm" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; if [[ "${FF_GPU_BACKEND}" == "hip_rocm" ]]; then gpu_backend_version="-${hip_version}" fi + cuda_version="empty" fi # Get number of cores available on the machine. Build with all cores but one, to prevent RAM choking @@ -97,7 +193,7 @@ cores_available=$(nproc --all) n_build_cores=$(( cores_available -1 )) # check python_version -if [[ "$python_version" != @(3.8|3.9|3.10|3.11|latest) ]]; then +if [[ "$python_version" != @(3.8|3.9|3.10|3.11|3.12|latest) ]]; then echo "python_version not supported!" 
exit 0 fi diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index 25c45646f..92423adf2 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -19,10 +19,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binut # Install Python3 with Miniconda ARG python_version "latest" -#RUN MINICONDA_SCRIPT_NAME=Miniconda3-latest-Linux-x86_64.sh; \ -RUN MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \ - if [ "$python_version" != "3.8" ] && [ "$python_version" != "3.9" ] && [ "$python_version" != "3.10" ] && [ "$python_version" != "3.11" ] && [ "$python_version" != "latest" ]; then \ - echo "python_version '${python_version}' is not supported, please choose among {3.8, 3.9, 3.10, 3.11 or latest (default)}"; \ +RUN MINICONDA_SCRIPT_NAME=Miniconda3-latest-Linux-x86_64.sh; \ + if [ "$python_version" != "3.8" ] && [ "$python_version" != "3.9" ] && [ "$python_version" != "3.10" ] && [ "$python_version" != "3.11" ] && [ "$python_version" != "3.12" ] && [ "$python_version" != "latest" ]; then \ + echo "python_version '${python_version}' is not supported, please choose among {3.8, 3.9, 3.10, 3.11, 3.12 or latest (default)}"; \ exit 1; \ fi; \ if [ "${python_version}" = "3.8" ]; then \ @@ -33,6 +32,10 @@ RUN MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \ MINICONDA_SCRIPT_NAME=Miniconda3-py310_23.5.2-0-Linux-x86_64.sh; \ elif [ "${python_version}" = "3.11" ]; then \ MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \ + elif [ "${python_version}" = "3.12" ]; then \ + MINICONDA_SCRIPT_NAME=Miniconda3-py312_25.1.1-2-Linux-x86_64.sh; \ + elif [ "${python_version}" = "latest" ]; then \ + MINICONDA_SCRIPT_NAME=Miniconda3-latest-Linux-x86_64.sh; \ fi; \ wget -c -q https://repo.continuum.io/miniconda/${MINICONDA_SCRIPT_NAME} && \ mv ./${MINICONDA_SCRIPT_NAME} ~/${MINICONDA_SCRIPT_NAME} && \ @@ -42,7 +45,8 @@ RUN MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \ /opt/conda/bin/conda config --set solver classic && \ /opt/conda/bin/conda upgrade --all && \ /opt/conda/bin/conda install conda-build conda-verify && \ - /opt/conda/bin/conda clean -ya + /opt/conda/bin/conda clean -ya && \ + ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/libstdc++.so.6 # set MAKEFLAGS to speedup any dependency that uses make ARG N_BUILD_CORES @@ -58,20 +62,6 @@ ARG FF_GPU_BACKEND "cuda" ARG cuda_version "" ARG hip_version "5.6" -# Update NCCL if FF_GPU_BACKEND is cuda -# RUN /bin/bash -c 'if [ "$FF_GPU_BACKEND" = "cuda" ]; then \ -# echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Updating NCCL"; \ -# ubuntu_version=$(lsb_release -rs); \ -# ubuntu_version=${ubuntu_version//./}; \ -# wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb"; \ -# DEBIAN_FRONTEND=noninteractive dpkg -i cuda-keyring_1.0-1_all.deb; \ -# DEBIAN_FRONTEND=noninteractive apt-get update -y --allow-change-held-packages; \ -# rm -f cuda-keyring_1.0-1_all.deb; \ -# DEBIAN_FRONTEND=noninteractive apt install -y --allow-change-held-packages libnccl2 libnccl-dev; \ -# else \ -# echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping updating NCCL"; \ -# fi' - # Install hip dependencies if FF_GPU_BACKEND is hip_cuda or hip_rocm # Note that amd's docs say to also install the `hip-runtime-nvidia` package. 
This # package attempts to re-install cuda even though cuda is already installed @@ -93,22 +83,21 @@ RUN if [ "$FF_GPU_BACKEND" = "hip_cuda" ] || [ "$FF_GPU_BACKEND" = "hip_rocm" ] elif [ "$hip_version" = "5.5" ]; then \ AMD_GPU_SCRIPT_NAME=amdgpu-install_5.5.50500-1_all.deb; \ fi; \ - AMD_GPU_SCRIPT_URL="https://repo.radeon.com/amdgpu-install/${hip_version}/ubuntu/focal/${AMD_GPU_SCRIPT_NAME}"; \ + ubuntu_codename=$(lsb_release -cs); \ + AMD_GPU_SCRIPT_URL="https://repo.radeon.com/amdgpu-install/${hip_version}/ubuntu/${ubuntu_codename}/${AMD_GPU_SCRIPT_NAME}"; \ # Download and install AMD GPU software with ROCM and HIP support wget $AMD_GPU_SCRIPT_URL; \ apt-get install -y ./${AMD_GPU_SCRIPT_NAME}; \ rm ./${AMD_GPU_SCRIPT_NAME}; \ amdgpu-install -y --usecase=hip,rocm --no-dkms; \ apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk rocm-device-libs; \ - # Install protobuf dependencies - apt-get update -y && sudo apt-get install -y pkg-config zip g++ zlib1g-dev autoconf automake libtool make; \ else \ echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping installing HIP dependencies"; \ fi RUN rm -rf /var/lib/apt/lists/* # Install python packages and other dependencies -RUN conda install -c conda-forge cmake make pillow cmake-build-extension numpy pandas keras-preprocessing +RUN conda install -c conda-forge cmake make cmake-build-extension numpy pandas # Install Pytorch COPY docker/flexflow-environment/install_pytorch.sh /usr/local/bin/install_pytorch.sh RUN if [ "$FF_GPU_BACKEND" == "cuda" ] ; then \ diff --git a/docker/publish.sh b/docker/publish.sh index c70419a9c..bfbbbc6bf 100755 --- a/docker/publish.sh +++ b/docker/publish.sh @@ -4,13 +4,39 @@ set -euo pipefail # Usage: ./publish.sh # Optional environment variables: FF_GPU_BACKEND, cuda_version, hip_version +set_cuda_version_version() { + # If the user provided a cuda_version, use that. + if [[ -n "${cuda_version:-}" ]]; then + return 0 + fi + + # Otherwise, check that nvidia-smi is available. + if ! command -v nvidia-smi >/dev/null 2>&1; then + echo "Error: nvidia-smi not found and cuda_version is not set." >&2 + return 1 + fi + + # Extract the CUDA version from nvidia-smi output. + local nvidia_output cuda_ver + nvidia_output=$(nvidia-smi) + cuda_ver=$(echo "$nvidia_output" | grep "CUDA Version" | sed -n 's/.*CUDA Version:\s*\([0-9]\+\.[0-9]\+\).*/\1/p') + + if [[ -z "$cuda_ver" ]]; then + echo "Error: Unable to detect CUDA version from nvidia-smi." >&2 + return 1 + fi + + export cuda_version="$cuda_ver" + return 0 +} + + # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" # Parse input params image=${1:-flexflow} FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} -cuda_version=${cuda_version:-"empty"} hip_version=${hip_version:-"empty"} # Check docker image name @@ -33,20 +59,11 @@ else fi if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then - # Autodetect cuda version if not specified - if [[ $cuda_version == "empty" ]]; then - # shellcheck disable=SC2015 - cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}' || true) - # Change cuda_version eg. V11.7.99 to 11.7 - cuda_version=${cuda_version:1:4} - if [[ -z "$cuda_version" ]]; then - echo "Could not detect CUDA version. Please specify one manually by setting the 'cuda_version' env." - exit 1 - fi - fi + set_cuda_version_version || { echo "Failed to set gpu_backend_version." 
>&2; exit 1; } + # Check that CUDA version is supported - if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then - echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" + if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8) ]]; then + echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8}" exit 1 fi # Set cuda version suffix to docker image name diff --git a/docker/pull.sh b/docker/pull.sh index f641e1a59..0ac456fa9 100755 --- a/docker/pull.sh +++ b/docker/pull.sh @@ -4,13 +4,39 @@ set -euo pipefail # Usage: ./pull.sh # Optional environment variables: FF_GPU_BACKEND, cuda_version, hip_version +set_cuda_version_version() { + # If the user provided a cuda_version, use that. + if [[ -n "${cuda_version:-}" ]]; then + return 0 + fi + + # Otherwise, check that nvidia-smi is available. + if ! command -v nvidia-smi >/dev/null 2>&1; then + echo "Error: nvidia-smi not found and cuda_version is not set." >&2 + return 1 + fi + + # Extract the CUDA version from nvidia-smi output. + local nvidia_output cuda_ver + nvidia_output=$(nvidia-smi) + cuda_ver=$(echo "$nvidia_output" | grep "CUDA Version" | sed -n 's/.*CUDA Version:\s*\([0-9]\+\.[0-9]\+\).*/\1/p') + + if [[ -z "$cuda_ver" ]]; then + echo "Error: Unable to detect CUDA version from nvidia-smi." >&2 + return 1 + fi + + export cuda_version="$cuda_ver" + return 0 +} + + # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" # Parse input params image=${1:-flexflow} FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} -cuda_version=${cuda_version:-"empty"} hip_version=${hip_version:-"empty"} # Check docker image name @@ -33,26 +59,12 @@ fi gpu_backend_version="" if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then - # Autodetect cuda version if not specified - if [[ $cuda_version == "empty" ]]; then - # shellcheck disable=SC2015 - cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}' || true) - # Change cuda_version eg. V11.7.99 to 11.7 - cuda_version=${cuda_version:1:4} - if [[ -z "$cuda_version" ]]; then - echo "Could not detect CUDA version. Please specify one manually by setting the 'cuda_version' env." - exit 1 - fi - fi + set_cuda_version_version || { echo "Failed to set gpu_backend_version." 
>&2; exit 1; } # Check that CUDA version is supported - if [[ "$cuda_version" != @(11.1|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then - echo "cuda_version is not available for download, please choose among {11.1|11.6|11.7|11.8|12.0|12.1|12.2}" + if [[ "$cuda_version" != @(11.1|11.6|11.7|11.8|12.0|12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8) ]]; then + echo "cuda_version is not available for download, please choose among {11.1|11.6|11.7|11.8|12.0|12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8}" exit 1 fi - # Use CUDA 12.2 for all versions greater or equal to 12.2 for now - if [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then - cuda_version=12.2 - fi # Set cuda version suffix to docker image name echo "Downloading $image docker image with CUDA $cuda_version" gpu_backend_version="-${cuda_version}" diff --git a/docker/run.sh b/docker/run.sh index d6c8e4fad..72bc708db 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -4,13 +4,38 @@ set -euo pipefail # Usage: ./run.sh # Optional environment variables: FF_GPU_BACKEND, cuda_version, hip_version, ATTACH_GPUS, SHM_SIZE +set_cuda_version_version() { + # If the user provided a cuda_version, use that. + if [[ -n "${cuda_version:-}" ]]; then + return 0 + fi + + # Otherwise, check that nvidia-smi is available. + if ! command -v nvidia-smi >/dev/null 2>&1; then + echo "Error: nvidia-smi not found and cuda_version is not set." >&2 + return 1 + fi + + # Extract the CUDA version from nvidia-smi output. + local nvidia_output cuda_ver + nvidia_output=$(nvidia-smi) + cuda_ver=$(echo "$nvidia_output" | grep "CUDA Version" | sed -n 's/.*CUDA Version:\s*\([0-9]\+\.[0-9]\+\).*/\1/p') + + if [[ -z "$cuda_ver" ]]; then + echo "Error: Unable to detect CUDA version from nvidia-smi." >&2 + return 1 + fi + + export cuda_version="$cuda_ver" + return 0 +} + # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" # Parse input params image=${1:-flexflow} FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda} -cuda_version=${cuda_version:-"empty"} hip_version=${hip_version:-"empty"} # Parameter controlling whether to attach GPUs to the Docker container @@ -49,28 +74,8 @@ fi gpu_backend_version="" if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then - # Autodetect cuda version if not specified - if [[ $cuda_version == "empty" ]]; then - # shellcheck disable=SC2015 - cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}' || true) - # Change cuda_version eg. V11.7.99 to 11.7 - cuda_version=${cuda_version:1:4} - if [[ -z "$cuda_version" ]]; then - echo "Could not detect CUDA version. Please specify one manually by setting the 'cuda_version' env." - exit 1 - fi - fi - # Check that CUDA version is supported - if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then - echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" - exit 1 - fi - # Use CUDA 12.2 for all versions greater or equal to 12.2 for now - if [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then - cuda_version=12.2 - fi - # Set cuda version suffix to docker image name - echo "Running $image docker image with CUDA $cuda_version" + set_cuda_version_version || { echo "Failed to set gpu_backend_version." 
>&2; exit 1; } + echo "Running $image docker image with CUDA: $cuda_version" gpu_backend_version="-${cuda_version}" fi diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 5cbe395c1..2e9602e15 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -64,14 +64,12 @@ class BatchConfig { BatchConfig(); int num_active_requests() const; int num_active_tokens() const; - int finetuning_request_index() const; int num_finetuning_fwd_requests() const; int num_finetuning_fwd_tokens() const; int num_finetuning_bwd_requests() const; int num_finetuning_bwd_tokens() const; bool peft_bwd_applies_to_this_layer(int layer) const; - static int max_requests_per_batch(); static int max_tokens_per_batch(); static int max_verify_tokens_per_batch(); @@ -91,8 +89,6 @@ class BatchConfig { static int const MAX_SPEC_TREE_TOKEN_NUM = 64; static int const MAX_PEFT_CONFIG_SIZE = 1024; - // number of tokens in prompt phase, start offset of tokens in inc_decoding - // phase. num_tokens - num_prompt_tokens = num_generation_tokens; int num_tokens = 0, num_generation_tokens = 0; struct PerRequestInfo { diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 2cba1cbb9..1deb7ad83 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -95,6 +95,10 @@ bool flexflow_config_get_enable_peft(flexflow_config_t handle_); bool flexflow_config_get_enable_peft_finetuning(flexflow_config_t handle_); void flexflow_config_set_enable_peft_finetuning(flexflow_config_t handle_, bool value); +bool flexflow_config_get_enable_peft_finetuning(flexflow_config_t handle_); +void flexflow_config_set_enable_peft_finetuning(flexflow_config_t handle_, + bool value); + void flexflow_config_set_data_parallelism_degree(flexflow_config_t handle_, int value); diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index e0f4dd0ba..7a0187712 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -250,6 +250,8 @@ class Op { std::vector const &, MachineView const *mv = nullptr) { assert(false); + Legion::FutureMap empty_map; + return empty_map; } virtual void print_layer(FFModel const &model) = 0; template diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 594c43053..d7d4c10d3 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -185,6 +185,8 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { void *qk_prods, *qk_prods_softmax; void *attn_heads; BatchConfig::PerTokenInfo *token_infos; + BatchConfig::PerTokenInfo *peft_token_infos; + BatchConfig::PerTokenInfo *peft_token_infos_device; BatchConfig::PerRequestInfo *request_infos; DataType quantization_type; bool offload; @@ -204,7 +206,8 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { // PEFT specific fields void *softmax_activation_buffer; void *query_activation_buffer; - size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; + size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0, + peft_token_infos_size = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/linear_kernels.h b/include/flexflow/ops/kernels/linear_kernels.h index 0284970d5..b7deeee6c 100644 --- a/include/flexflow/ops/kernels/linear_kernels.h +++ b/include/flexflow/ops/kernels/linear_kernels.h @@ -96,7 +96,7 @@ void store_peft_activations(LinearMeta const *m, BatchConfig const *bc, size_t 
out_dim, DT *output_ptr, - cudaStream_t stream); + ffStream_t stream); template void peft_bwd_kernel(LinearMeta const *m, BatchConfig const *bc, diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index 5d34b73fb..7279edb80 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -81,6 +81,12 @@ void peft_bwd_kernel(SoftmaxMeta const *m, DT *input_grad_ptr, int num_classes, ffStream_t stream); +template +void store_peft_activations(SoftmaxMeta *m, + BatchConfig const *bc, + int num_classes, + DT *output_ptr, + ffStream_t stream); template void store_peft_activations(SoftmaxMeta *m, diff --git a/include/flexflow/parallel_ops/allreduce.h b/include/flexflow/parallel_ops/allreduce.h index 1d3f5d421..7cb09e7dc 100644 --- a/include/flexflow/parallel_ops/allreduce.h +++ b/include/flexflow/parallel_ops/allreduce.h @@ -16,6 +16,7 @@ class AllReduce : public ParallelOp { using Input = ParallelTensor; AllReduce(FFModel &model, + LayerID const &_layer_guid, const ParallelTensor input, int allreduce_legion_dim, char const *name = NULL); @@ -75,6 +76,7 @@ class AllReduce : public ParallelOp { Params get_params() const; public: + LayerID layer_guid; int allreduce_dim; }; diff --git a/include/flexflow/parallel_ops/allreduce_params.h b/include/flexflow/parallel_ops/allreduce_params.h index a0daac8f9..32713778b 100644 --- a/include/flexflow/parallel_ops/allreduce_params.h +++ b/include/flexflow/parallel_ops/allreduce_params.h @@ -4,6 +4,7 @@ namespace FlexFlow { struct AllReduceParams { + LayerID layer_guid; int allreduce_legion_dim; char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; diff --git a/include/flexflow/parallel_ops/parallel_identity.h b/include/flexflow/parallel_ops/parallel_identity.h index 424cc9b28..f5ef39055 100644 --- a/include/flexflow/parallel_ops/parallel_identity.h +++ b/include/flexflow/parallel_ops/parallel_identity.h @@ -16,6 +16,7 @@ class ParallelIdentity : public ParallelOp { using Input = ParallelTensor; ParallelIdentity(FFModel &model, + LayerID const &_layer_guid, const ParallelTensor input, int parallel_identity_legion_dim, char const *name = NULL); @@ -75,6 +76,7 @@ class ParallelIdentity : public ParallelOp { Params get_params() const; public: + LayerID layer_guid; int parallel_identity_dim; }; diff --git a/include/flexflow/parallel_ops/parallel_identity_params.h b/include/flexflow/parallel_ops/parallel_identity_params.h index 6eeed662e..d11ddb5f2 100644 --- a/include/flexflow/parallel_ops/parallel_identity_params.h +++ b/include/flexflow/parallel_ops/parallel_identity_params.h @@ -4,6 +4,7 @@ namespace FlexFlow { struct ParallelIdentityParams { + LayerID layer_guid; int parallel_identity_legion_dim; char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 8b07a12d5..8b9bd7552 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -82,7 +82,6 @@ struct InferenceReqProfileInfo { int decoding_step_idx; long long timestamp; }; - struct Request { enum Status { PENDING = 101, // loading prompt @@ -133,10 +132,8 @@ struct Request { int ssm_cache_size = 0; int llm_cache_size = 0; std::vector beam_trees; - Request() = default; static Request from_other(Request const &other); - friend std::ostream &operator<<(std::ostream &os, Request const &req); }; @@ -173,7 +170,6 @@ class RequestManager { size_t 
get_num_ssms(); bool load_request_token_ids(Request &request); - void set_verbose(bool verbose); void set_max_requests_per_batch(int max_num_requests); int get_max_requests_per_batch(); @@ -186,8 +182,6 @@ class RequestManager { int get_max_verify_tokens_per_batch(); int get_max_sequence_length(); void set_max_sequence_length(int max_seq_length); - int get_max_finetuning_sequence_length(); - void set_max_finetuning_sequence_length(int max_seq_length); void push_spec_infer_tree_width(int tree_width); void set_enable_peft_finetuning(bool enable_peft_finetuning_); void set_inference_finished(bool finished = true); @@ -244,7 +238,6 @@ class RequestManager { void add_peft_config_to_request_info(BatchConfig &bc, int req_idx, LoraLinearConfig const &peft_config); - // helpers for prepare_next_batch void process_inf_req_progress(BatchConfig const &old_fwd_bc, InferenceResult const &result); diff --git a/inference/peft/CMakeLists.txt b/inference/peft/CMakeLists.txt index 623e78c52..1fff8cbf1 100644 --- a/inference/peft/CMakeLists.txt +++ b/inference/peft/CMakeLists.txt @@ -124,4 +124,4 @@ target_include_directories(${project_target5} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} $ target_include_directories(${project_target5} PRIVATE ${CMAKE_SOURCE_DIR}/inference) target_link_libraries(${project_target5} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) set(BIN_DEST "bin") -install(TARGETS ${project_target5} DESTINATION ${BIN_DEST}) \ No newline at end of file +install(TARGETS ${project_target5} DESTINATION ${BIN_DEST}) diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index 09dcc68ee..de922eb51 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -34,7 +34,8 @@ struct FilePaths { std::string cache_folder_path; std::string prompt_file_path; std::string dataset_file_path; - std::string output_folder_path; + std::string output_file_path; + std::string profiling_folder_path; }; void parse_input_args(char **argv, @@ -52,7 +53,8 @@ void parse_input_args(char **argv, int &max_tokens_per_batch, int &max_sequence_length, int &max_training_steps, - int &num_layers_per_finetuning_step) { + int &num_layers_per_finetuning_step, + bool &run_warmup) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -93,10 +95,18 @@ void parse_input_args(char **argv, paths.output_folder_path = std::string(argv[++i]); continue; } + if (!strcmp(argv[i], "-profiling-folder")) { + paths.profiling_folder_path = std::string(argv[++i]); + continue; + } if (!strcmp(argv[i], "--use-full-precision")) { use_full_precision = true; continue; } + if (!strcmp(argv[i], "--warmup")) { + run_warmup = true; + continue; + } // verbose logging to stdout if (!strcmp(argv[i], "--verbose")) { verbose = true; @@ -147,44 +157,6 @@ void parse_input_args(char **argv, wordfree(&p); } -std::vector parse_trace_file(std::string const &trace_file_path) { - using json = nlohmann::json; - std::ifstream file_handle(trace_file_path); - assert(file_handle.good() && "Trace file does not exist."); - nlohmann::ordered_json prompt_json = - nlohmann::ordered_json::parse(file_handle, - /*parser_callback_t */ nullptr, - /*allow_exceptions */ true, - /*ignore_comments */ true); - file_handle.close(); - auto &metadata = prompt_json["metadata"]; - int num_warmup_requests = metadata["num_warmup_requests"]; - int num_regular_requests = 0, total_requests = 0; - std::vector warmup_requests, requests; - for (auto &entry : prompt_json["entries"]) { - int prompt_length = entry["prompt_length"]; - 
int response_length = entry["response_length"]; - std::string text = entry["prompt"]; - bool is_warmup_request = total_requests < num_warmup_requests; - - Request inference_req; - inference_req.prompt = text; - inference_req.add_special_tokens = false; - inference_req.max_new_tokens = response_length; - - if (is_warmup_request) { - warmup_requests.push_back(inference_req); - } else { - // printf("Prompt[%d]: %s\n", total_requests, text.c_str()); - requests.push_back(inference_req); - num_regular_requests++; - } - - total_requests++; - } - return requests; -} - std::vector make_warmup_requests(int num_inf_request, int num_finetuning_steps, PEFTModelID *peft_model_id) { @@ -228,9 +200,10 @@ void FlexFlow::top_level_task(Task const *task, int max_requests_per_batch = 1; int max_tokens_per_batch = 128; int max_sequence_length = 256; - int max_training_steps = -1; + int max_training_steps = 2; bool enable_peft_finetuning = true; int num_layers_per_finetuning_step = -1; + bool run_warmup = false; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -250,7 +223,8 @@ void FlexFlow::top_level_task(Task const *task, max_tokens_per_batch, max_sequence_length, max_training_steps, - num_layers_per_finetuning_step); + num_layers_per_finetuning_step, + run_warmup); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == ffconfig.numNodes * ffconfig.workersPerNode); @@ -417,16 +391,23 @@ void FlexFlow::top_level_task(Task const *task, rm->start_background_server(&model); // Add PEFT adapter(s) - // PEFTModelID *peft_model_id = nullptr, *peft_model_id_finetuning = nullptr; - // if (!peft_model_name.empty()) { - // peft_model_id = model.register_peft_adapter(peft_config); - // if (enable_peft_finetuning) { - // peft_model_id_finetuning = - // model.register_peft_adapter(peft_config_finetuning); - // } - // } - PEFTModelID *peft_model_id_finetuning = - model.register_peft_adapter(peft_config_finetuning); + PEFTModelID *peft_model_id = nullptr, *peft_model_id_finetuning = nullptr; + if (!peft_model_name.empty() && !enable_peft_finetuning) { + peft_model_id = model.register_peft_adapter(peft_config); + } + if (enable_peft_finetuning) { + peft_model_id_finetuning = + model.register_peft_adapter(peft_config_finetuning); + } + + if (run_warmup) { + std::vector warmup_requests = + make_warmup_requests(10, 1000, peft_model_id_finetuning); + std::vector warmup_result = + model.generate(warmup_requests); + rm->set_inference_finished(false); // reset inference finished flag + std::cout << "----------warmup finished--------------" << std::endl; + } // Run workload { @@ -452,6 +433,23 @@ void FlexFlow::top_level_task(Task const *task, finetuning_req.peft_finetuning_info.max_training_steps = max_training_steps; requests.push_back(finetuning_req); + // Add fine-tuning request + if (enable_peft_finetuning) { + assert(!file_paths.dataset_file_path.empty() && + "Dataset file path is required for fine-tuning."); + printf("Finetuning request with dataset %s\n", + file_paths.dataset_file_path.c_str()); + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.peft_model_id = (peft_model_id_finetuning != nullptr) + ? 
*peft_model_id_finetuning + : PEFTModelID::NO_ID; + fine_tuning_req.peft_finetuning_info.dataset_filepath = + file_paths.dataset_file_path; + fine_tuning_req.peft_finetuning_info.max_training_steps = + max_training_steps; + requests.push_back(fine_tuning_req); + } std::vector result = model.generate(requests); } @@ -474,6 +472,32 @@ void FlexFlow::top_level_task(Task const *task, 0.0, // arrival rate 10); // num_warmup_requests + if (!file_paths.profiling_folder_path.empty()) { + std::cout << "Saving profiling info..." << std::endl; + std::string dataset_name; + // set dataset name to "wildchat" if the prompt file path contains + // "wildchat" + if (file_paths.prompt_file_path.find("wildchat") != std::string::npos) { + dataset_name = "wildchat"; + } else if (file_paths.prompt_file_path.find("sharegpt") != + std::string::npos) { + dataset_name = "sharegpt"; + } else { + dataset_name = "unknown"; + } + rm->save_profiling_info_to_csv(file_paths.profiling_folder_path, + dataset_name, + llm_model_name, + model.config.tensor_parallelism_degree, + max_requests_per_batch, + max_tokens_per_batch, + 0.0, // arrival rate + 10); // num_warmup_requests + } + + if (peft_model_id != nullptr) { + free(peft_model_id); + } if (peft_model_id_finetuning != nullptr) { free(peft_model_id_finetuning); } diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 8ca2c97e4..d88a2852b 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -825,7 +825,7 @@ def enable_peft_finetuning(self, value): if type(value) is not bool: raise ValueError("enable_peft_finetuning must be specified as a boolean value") ffc().flexflow_config_set_enable_peft_finetuning(self.handle, value) - + @property def cpu_offload(self): return ffc().flexflow_config_get_offload(self.handle) @@ -1643,6 +1643,15 @@ def set_max_sequence_length(self, max_length): def get_max_sequence_length(self): return ffc().flexflow_request_manager_get_max_sequence_length(self.handle) + + def set_num_transformers_layers(self, num_layers): + return ffc().flexflow_request_manager_set_num_transformers_layers( + self.handle, num_layers + ) + def set_num_layers_per_finetuning_step(self, num_layers): + return ffc().flexflow_request_manager_set_num_layers_per_finetuning_step( + self.handle, num_layers + ) def set_max_finetuning_sequence_length(self, max_length): return ffc().flexflow_request_manager_set_max_finetuning_sequence_length( diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index 24c5f210b..6c547d295 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -90,7 +90,7 @@ def init( - benchmarking: whether to run benchmaking only, without loading real weights, defaults to False - inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False - fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True - - log_instance_creation: whether to log the creation of FlexFlow instances, defaults to False + - log_instance_cration: whether to log the creation of the FlexFlow instances, defaults to False The configurations are passed down to the FlexFlow runtime (implemented in C++) via command line arguments. 
@@ -133,7 +133,7 @@ def init( :type inference_debugging: Optional[bool], optional :param fusion: whether to enable the FlexFlow operator fusion optimization, defaults to True :type fusion: Optional[bool], optional - :param log_instance_cration: whether to log the creation of FlexFlow instances, defaults to False + :param log_instance_cration: whether to log the creation of the FlexFlow instances, defaults to False :type log_instance_cration: Optional[bool], optional :raises ValueError: this function will raise an exception if the user passes both a configs_dict and some named parameters diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 68b226796..394869426 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -259,17 +259,15 @@ def __get_resource_path( def __is_empty_dir(self, folder: str) -> bool: """Check whether a folder only contains the rev_sha.txt file - Args: folder (str): Path to the folder to check - Returns: bool: True if the folder is empty, False otherwise """ if not os.path.isdir(folder) or not os.path.exists(folder): return True return len(os.listdir(folder)) == 1 and "rev_sha.txt" in os.listdir(folder) - + def __need_cache_refresh( self, model_name: str, resource_type: CachedResourceType ) -> bool: @@ -285,7 +283,6 @@ def __need_cache_refresh( """ resource_path = self.__get_resource_path(model_name, resource_type) ff_revision, latest_revision = self.__get_revision_hashes(self.model_name, resource_path) - if self.refresh_cache or not os.path.exists(resource_path) or self.__is_empty_dir(resource_path) or ff_revision != latest_revision: print( f"Refreshing {resource_type} in cache for model {model_name} at path {resource_path} ..." @@ -444,7 +441,6 @@ def compile( max_tokens_per_batch: int = 64, max_concurrent_adapters: int = 1, enable_peft_finetuning: bool = False, - max_finetuning_seq_length: int = -1, num_bwd_layers_per_ft_step: int = -1, ssms: list = [], ): @@ -462,8 +458,6 @@ def compile( :type max_concurrent_adapters: int, optional :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False :type enable_peft_finetuning: bool, optional - :param max_finetuning_seq_length: The maximum sequence length to allow for finetuning, defaults to -1 (i.e. same as max_seq_length) - :type max_finetuning_seq_length: int, optional :param num_bwd_layers_per_ft_step: The number of backward layers to run per finetuning step, defaults to -1 (i.e. 
all layers) :type num_bwd_layers_per_ft_step: int, optional :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] @@ -494,10 +488,6 @@ def compile( self.rm.set_max_sequence_length(max_seq_length) self.rm.set_max_concurrent_adapters(max_concurrent_adapters) self.rm.set_enable_peft_finetuning(enable_peft_finetuning) - if max_finetuning_seq_length == -1: - self.rm.set_max_finetuning_sequence_length(max_seq_length) - else: - self.rm.set_max_finetuning_sequence_length(max_finetuning_seq_length) self.rm.set_num_transformers_layers(self.hf_config.num_hidden_layers) if num_bwd_layers_per_ft_step != -1: self.rm.set_num_layers_per_finetuning_step(num_bwd_layers_per_ft_step) @@ -771,7 +761,6 @@ def compile( max_tokens_per_batch: int = 2048, max_concurrent_adapters: int = 1, enable_peft_finetuning: bool = False, - max_finetuning_seq_length: int = -1, num_bwd_layers_per_ft_step: int = -1, ssms: list = [], ): @@ -788,8 +777,6 @@ def compile( :type max_concurrent_adapters: int, optional :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False :type enable_peft_finetuning: bool, optional - :param max_finetuning_seq_length: The maximum sequence length to allow for finetuning, defaults to -1 (i.e. same as max_seq_length) - :type max_finetuning_seq_length: int, optional :param num_bwd_layers_per_ft_step: The number of backward layers to run per finetuning step, defaults to -1 (i.e. all layers) :type num_bwd_layers_per_ft_step: int, optional :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] @@ -802,7 +789,6 @@ def compile( max_tokens_per_batch, max_concurrent_adapters, enable_peft_finetuning, - max_finetuning_seq_length, num_bwd_layers_per_ft_step, ssms, ) diff --git a/requirements.txt b/requirements.txt index 34d0a39bf..87e9b9695 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,12 @@ -cffi>=1.11.0 -numpy>=1.16.0 -qualname>=0.1.0 -keras_preprocessing>=1.1.2 -Pillow +cffi +numpy cmake-build-extension ninja requests regex -torch>=1.13.1 -torchaudio>=0.13.1 -torchvision>=0.14.1 -onnx +torch +torchaudio +torchvision transformers>=4.47.1 sentencepiece einops diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 406513a85..55d19763c 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -181,7 +181,8 @@ bool flexflow_config_get_enable_peft_finetuning(flexflow_config_t handle_) { FFConfig *handle = FFCObjectWrapper::unwrap(handle_); return handle->enable_peft_finetuning; } -void flexflow_config_set_enable_peft_finetuning(flexflow_config_t handle_, bool value) { +void flexflow_config_set_enable_peft_finetuning(flexflow_config_t handle_, + bool value) { FFConfig *handle = FFCObjectWrapper::unwrap(handle_); handle->enable_peft_finetuning = value; } @@ -2818,7 +2819,7 @@ void flexflow_request_manager_set_num_transformers_layers( RequestManager *handle = FFCObjectWrapper::unwrap(handle_); handle->set_num_transformer_layers(num_transformers_layers_); DEBUG_PRINT("[RequestManager] set num_transformers_layers %d", - num_transformers_layers_); + num_transformers_layers_); } void flexflow_request_manager_set_num_layers_per_finetuning_step( @@ -2826,7 +2827,7 @@ void flexflow_request_manager_set_num_layers_per_finetuning_step( RequestManager *handle = FFCObjectWrapper::unwrap(handle_); handle->set_num_layers_per_finetuning_step(num_layers_per_finetuning_step_); DEBUG_PRINT("[RequestManager] set num layers per finetuning step %d", - num_layers_per_finetuning_step_); + 
num_layers_per_finetuning_step_); } void flexflow_request_manager_register_tokenizer( diff --git a/src/ops/add_bias_residual_layer_norm.cpp b/src/ops/add_bias_residual_layer_norm.cpp index b58b6a45c..550a30d1b 100644 --- a/src/ops/add_bias_residual_layer_norm.cpp +++ b/src/ops/add_bias_residual_layer_norm.cpp @@ -41,10 +41,9 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( DataType data_type = ln->data_type; size_t in_dim = ln->inputs[0]->dims[0].size / ln->inputs[0]->dims[0].degree; allocated_peft_buffer_size = - enable_peft_finetuning - ? (data_type_size(data_type) * - BatchConfig::max_finetuning_sequence_length() * in_dim) - : 0; + enable_peft_finetuning ? (data_type_size(data_type) * + BatchConfig::max_sequence_length() * in_dim) + : 0; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3 + allocated_peft_buffer_size; gpu_mem_allocator.create_legion_instance( @@ -222,7 +221,7 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; assert(m->allocated_peft_buffer_size == data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim); + BatchConfig::max_sequence_length() * in_dim); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index de4357a43..16629f493 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -40,10 +40,9 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( DataType data_type = ln->data_type; size_t in_dim = ln->inputs[0]->dims[0].size / ln->inputs[0]->dims[0].degree; allocated_peft_buffer_size = - enable_peft_finetuning - ? (data_type_size(data_type) * - BatchConfig::max_finetuning_sequence_length() * in_dim) - : 0; + enable_peft_finetuning ? 
(data_type_size(data_type) * + BatchConfig::max_sequence_length() * in_dim) + : 0; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3 + allocated_peft_buffer_size; @@ -219,7 +218,7 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; assert(m->allocated_peft_buffer_size == data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim); + BatchConfig::max_sequence_length() * in_dim); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; @@ -809,4 +808,4 @@ void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( } } -}; // namespace FlexFlow +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/ops/argmax.cpp b/src/ops/argmax.cpp index 50ba64d8c..9c8ba7c5f 100644 --- a/src/ops/argmax.cpp +++ b/src/ops/argmax.cpp @@ -560,4 +560,4 @@ ArgMaxMeta::~ArgMaxMeta(void) { reserveInst.destroy(); } } -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu index 12bfa6052..e3b22af0d 100644 --- a/src/ops/argmax.cu +++ b/src/ops/argmax.cu @@ -275,4 +275,4 @@ ArgMaxMeta::~ArgMaxMeta(void) { reserveInst.destroy(); } } -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 7b6f4454a..a57ff2782 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -97,11 +97,8 @@ __host__ void assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); - bool softmax_grad_additional_region = - (fused->op_op_type[fused->numOperators - 1] == OP_SOFTMAX); - assert((int)regions.size() == fused->numInputs + fused->numWeights + - fused->numOutputs + - softmax_grad_additional_region); + assert((int)regions.size() == + fused->numInputs + fused->numWeights + fused->numOutputs); GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorW output_accessor[MAX_NUM_OUTPUTS]; @@ -589,23 +586,13 @@ __host__ void assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain.get_volume() == my_output_accessor[0].domain.get_volume()); - if (op == fused->numOperators - 1) { // if this is the final operator - output_accessor[fused->numOutputs] = helperGetGenericTensorAccessorWO( - fused->output_data_types[fused->numOutputs - 1], - regions[roff], - task->regions[roff], - FID_DATA, - ctx, - runtime); - } SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; Kernels::Softmax::inference_kernel_wrapper( m, bc, (op == fused->numOperators - 1), my_input_accessor[0], - my_output_accessor[0], - output_accessor[fused->numOutputs]); + my_output_accessor[0]); break; } case OP_ALLREDUCE: { @@ -680,7 +667,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); // Return if no active PEFT bwd tokens if (bc->num_finetuning_bwd_tokens() == 0) { - return; + return false; } assert(metas->numOperators == fused->numOperators); @@ -790,38 +777,6 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, my_output_grad_accessor[i] = output_grad_accessor[my_off]; } switch (fused->op_op_type[op]) { - case OP_CONCAT: { - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - // TODO: implement this - assert(false); - // 
ConcatMeta *m = (ConcatMeta *)metas->meta[op]; - // int num_inputs = fused->op_num_inputs[op]; - // Kernels::Concat::peft_bwd_kernel_wrapper(m, - // my_output_accessor[0], - // my_input_accessor, - // num_inputs, - // m->legion_axis); - break; - } - case OP_BATCHNORM: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_grad_accessor[0].domain.get_dim() == 5); - assert(my_output_grad_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 2); - assert(my_weight_accessor[1].domain.get_dim() == 2); - // TODO: implement this - assert(false); - // BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; - // BatchNorm::peft_bwd_kernel_kernel( - // m, - // my_input_accessor[0].get_float_ptr(), - // my_output_accessor[0].get_float_ptr(), - // my_weight_accessor[0].get_float_ptr(), - // my_weight_accessor[1].get_float_ptr()); - break; - } case OP_LINEAR: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -836,7 +791,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, LinearMeta *m = (LinearMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } assert(m->input_type[0] == my_input_grad_accessor[0].data_type); assert(m->input_type[0] == my_output_grad_accessor[0].data_type); @@ -848,9 +803,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, my_output_grad_accessor[0].ptr, my_weight_accessor[0].ptr, in_dim, - out_dim, - num_infr_tokens, - num_peft_tokens); + out_dim); break; } case OP_LORA: { @@ -868,7 +821,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } assert(m->input_type[0] == my_input_grad_accessor[0].data_type); assert(m->output_type[0] == my_output_grad_accessor[0].data_type); @@ -884,46 +837,8 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, shard_id, my_input_grad_accessor[0], my_output_grad_accessor[0]); - break; - } - case OP_BATCHMATMUL: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - Domain out_domain = my_output_grad_accessor[0].domain; - Domain a_domain = my_input_grad_accessor[0].domain; - Domain b_domain = my_input_grad_accessor[1].domain; - int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; - assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); - int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; - assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); - int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; - assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); - assert(a_domain.get_dim() == b_domain.get_dim()); - assert(a_domain.get_dim() == out_domain.get_dim()); - int batch = 1; - for (int i = 2; i < a_domain.get_dim(); i++) { - int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; - assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); - assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); - batch *= dim_size; - } - // TODO: implement me - assert(false); - // BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; - // Kernels::BatchMatmul::backward_kernel_wrapper( - // meta, - // my_output_accessor[0].get_float_ptr(), - // my_input_accessor[0].get_float_ptr(), - // my_input_accessor[1].get_float_ptr(), - // (float const *)nullptr, - // m, - // n, - // k, - // batch, - // meta->a_seq_length_dim, - // 
meta->b_seq_length_dim, - // fused->iter_config.seq_length); + Kernels::LoraLinear::save_peft_weights_if_needed( + m, bc, in_dim, out_dim, shard_id); break; } case OP_EW_ADD: @@ -951,37 +866,6 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, // so we do nothing for embedding break; } - case OP_GELU: - case OP_RELU: - case OP_SIGMOID: - case OP_TANH: - case OP_ELU: - case OP_SCALAR_TRUE_DIV: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_grad_accessor[0].domain == - my_output_grad_accessor[0].domain); - // TODO: implement me - assert(false); - // ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - // if (m->data_type == DT_HALF) { - // ElementUnary::forward_kernel_wrapper( - // m, - // my_input_accessor[0].get_half_ptr(), - // my_output_accessor[0].get_half_ptr(), - // my_input_accessor[0].domain.get_volume()); - // } else if (m->data_type == DT_FLOAT) { - // ElementUnary::forward_kernel_wrapper( - // m, - // my_input_accessor[0].get_float_ptr(), - // my_output_accessor[0].get_float_ptr(), - // my_input_accessor[0].domain.get_volume()); - // } else { - // assert(false && "Unsupported data type in ElementUnary forward"); - // } - break; - } case OP_RMS_NORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 1); @@ -989,7 +873,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } Kernels::RMSNorm::peft_bwd_kernel_wrapper(m, bc, @@ -1005,7 +889,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( m, @@ -1024,7 +908,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } assert(fused->op_num_weights[op] == 0); GenericTensorAccessorR biases; @@ -1037,19 +921,13 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, // biases); break; } - case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: - case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { - // TODO: implement me - assert(false); - break; - } case OP_LAYERNORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } if (m->elementwise_affine) { assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias)); @@ -1071,7 +949,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, (ResidualLayerNormMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } if (m->use_two_residuals) { assert(fused->op_num_inputs[op] == 3); @@ -1110,7 +988,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, (AddBiasResidualLayerNormMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } if (!m->elementwise_affine) { assert(fused->op_num_weights[op] == 1); // attn bias @@ -1140,7 +1018,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, SigmoidSiluMultiMeta const *m = 
(SigmoidSiluMultiMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } SigmoidSiluMulti::peft_bwd_kernel_wrapper(m, bc, @@ -1158,10 +1036,10 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } Kernels::Softmax::peft_bwd_kernel_wrapper( - m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + m, bc, my_input_grad_accessor[0]); break; } case OP_ALLREDUCE: { @@ -1170,7 +1048,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } Kernels::AllReduce::peft_bwd_kernel_wrapper( m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); @@ -1182,7 +1060,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, ParallelIdentityMeta const *m = (ParallelIdentityMeta *)metas->meta[op]; if (!bc->peft_bwd_applies_to_this_layer( m->layer_guid.transformer_layer_id)) { - return; + break; } runtime->concurrent_task_barrier(ctx); Kernels::ParallelIdentity::peft_bwd_kernel_wrapper( @@ -1221,13 +1099,16 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, } assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; - FusedOp::save_inference_tensors_to_file(metas->meta[op], - shard_id, - bc, - input_accessors_to_save, - weight_accessors_to_save, - output_accessors_to_save, - false); + if (bc->peft_bwd_applies_to_this_layer( + metas->meta[op]->layer_guid.transformer_layer_id)) { + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save, + false); + } } } return true; diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 8dd40c0cb..1ac58afda 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -116,14 +116,14 @@ __global__ void fill_entries_above_diagonal(DT *matrix, } } -bool is_inf_req_decoding_mode(BatchConfig const *bc, int i) { - return !bc->requestsInfo[i].finetuning_request && - !bc->requestsInfo[i].prompt_phase; +bool is_finetuning_bwd_request(BatchConfig const *bc, int request_id) { + return bc->requestsInfo[request_id].finetuning_request && + bc->requestsInfo[request_id].finetuning_backward_phase; } -bool is_finetuning_req_bwd_phase(BatchConfig const *bc, int i) { - return bc->requestsInfo[i].finetuning_request && - bc->requestsInfo[i].finetuning_backward_phase; +bool is_decoding_request(BatchConfig const *bc, int request_id) { + return !bc->requestsInfo[request_id].finetuning_request && + !bc->requestsInfo[request_id].prompt_phase; } template @@ -150,29 +150,37 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase && - !bc->requestsInfo[i].finetuning_request)) { + if (bc->request_completed[i] || is_decoding_request(bc, i) || + is_finetuning_bwd_request(bc, i)) { continue; } int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; + if (num_new_tokens <= 0) { + continue; + } // 
Copy query to m->query_activation_buffer if we need to compute // PEFT backward if (bc->requestsInfo[i].finetuning_request && !bc->requestsInfo[i].finetuning_backward_phase) { - int max_peft_tokens = BatchConfig::max_finetuning_sequence_length(); - // Check that we have at most one request that requires peft_bwd - assert(bc->num_finetuning_fwd_requests() == 1); - assert(bc->num_finetuning_bwd_requests() == 1); - assert(bc->requestsInfo[i].peft_model_id != PEFTModelID::NO_ID); - assert(!is_finetuning_req_bwd_phase(bc, i)); - int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); + // int max_peft_tokens = bc->requestsInfo[i].max_length; + int max_peft_tokens = BatchConfig::max_sequence_length(); size_t activation_size_needed = sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize; - assert(m->allocated_peft_buffer_size1 == activation_size_needed); - + if (activation_size_needed != m->allocated_peft_buffer_size1) { + std::cout << "activation_size_needed: " << activation_size_needed + << std::endl; + std::cout << "m->allocated_peft_buffer_size1: " + << m->allocated_peft_buffer_size1 << std::endl; + std::cout << "max_peft_tokens: " << max_peft_tokens << std::endl; + std::cout << "m->num_q_heads: " << m->num_q_heads << std::endl; + std::cout << "m->qProjSize: " << m->qProjSize << std::endl; + std::cout << "BatchConfig::max_sequence_length()" + << BatchConfig::max_sequence_length() << std::endl; + std::cout << "sizeof(DT)" << sizeof(DT) << std::endl; + } + assert(activation_size_needed == m->allocated_peft_buffer_size1); int parallelism = m->hidden_size * num_tokens; hipLaunchKernelGGL(HIP_KERNEL_NAME(store_query_cache), GET_BLOCKS(parallelism), @@ -317,18 +325,11 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, // Copy C_softmax to m->softmax_activation_buffer if we need to compute // PEFT backward if (bc->requestsInfo[i].finetuning_request) { - int max_peft_tokens = BatchConfig::max_finetuning_sequence_length(); - // Check that we have at most one request that requires peft_bwd - assert(bc->num_finetuning_fwd_requests() == 1); - assert(bc->num_finetuning_bwd_requests() == 0); - assert(bc->requestsInfo[i].peft_model_id != PEFTModelID::NO_ID); - assert(!is_finetuning_req_bwd_phase(bc, i)); - int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); + int max_peft_tokens = BatchConfig::max_sequence_length(); + DT *C_softmax = static_cast
<DT *>(m->qk_prods_softmax); size_t activation_size_needed = sizeof(DT) * max_peft_tokens * max_peft_tokens * m->num_q_heads; assert(activation_size_needed == m->allocated_peft_buffer_size2); - DT *C_softmax = static_cast<DT *>
(m->qk_prods_softmax); checkCUDA(hipMemcpyAsync(m->softmax_activation_buffer, C_softmax, sizeof(DT) * total_tokens * num_new_tokens * @@ -336,6 +337,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, hipMemcpyDeviceToDevice, stream)); } + // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ // softmax(QK.T/sqrt(d_k)).T { @@ -993,11 +995,11 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m, size_t qkv_proj_size = m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); - hipMemcpyAsync(m->devQKVProjArray, - qkv_ptr, - qkv_proj_size * sizeof(DT), - hipMemcpyDeviceToDevice, - stream); + checkCUDA(hipMemcpyAsync(m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * sizeof(DT), + hipMemcpyDeviceToDevice, + stream)); // phase 1: Implement kernel to apply rotary embedding and scaling apply_scaling_and_rotary( @@ -1015,13 +1017,27 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m, compute_attention_kernel_prompt
(m, bc, shard_id, stream); } - // compute output production and bias together for all tokens + if (bc->num_finetuning_fwd_tokens() > 0) { + assert(m->peft_token_infos != nullptr); + assert(m->peft_token_infos_size == sizeof(BatchConfig::PerTokenInfo) * + BatchConfig::max_sequence_length()); + int num_ft_tokens = bc->num_finetuning_fwd_tokens(); + int i = bc->finetuning_request_index(); + int tokens_previous_requests = + bc->requestsInfo[i].first_token_offset_in_batch; + int prev_steps_tokens = bc->requestsInfo[i].first_token_depth_in_request; + for (int j = 0; j < num_ft_tokens; j++) { + m->peft_token_infos[prev_steps_tokens + j] = + bc->tokensInfo[tokens_previous_requests + j]; + } + } + int num_tokens = bc->num_active_tokens(); - hipMemcpyAsync(output_ptr, - m->attn_heads, - m->oProjSize * num_tokens * sizeof(DT), - hipMemcpyDeviceToDevice, - stream); + checkCUDA(hipMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + hipMemcpyDeviceToDevice, + stream)); } std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, @@ -1120,32 +1136,33 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, assert( bc->peft_bwd_applies_to_this_layer(m->layer_guid.transformer_layer_id)); int i = bc->finetuning_request_index(); - int num_tokens = bc->requestsInfo[i].num_tokens_in_batch; int num_total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; // Currently assume we are calculating gradients for all tokens // of a request assert(num_tokens == num_total_tokens); + assert(num_total_tokens == bc->requestsInfo[i].max_length); + assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); int kt_block_size = m->kProjSize; - int kt_req_block_size = kt_block_size * m->num_q_heads * - BatchConfig::max_finetuning_sequence_length(); + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); int vt_block_size = m->vProjSize; - int vt_req_block_size = vt_block_size * m->num_q_heads * - BatchConfig::max_finetuning_sequence_length(); - assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + // Step 1: copy gradient before final projection into workspace { int m_ = m->vProjSize * m->num_q_heads; int n_ = num_tokens; DT *C = static_cast
(m->handle.workSpace); - hipMemcpyAsync(C, - output_grad_ptr + - bc->requestsInfo[i].first_token_offset_in_batch * - m->oProjSize, - m_ * n_ * sizeof(DT), - hipMemcpyDeviceToDevice, - stream); + checkCUDA(hipMemcpyAsync( + C, + output_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->oProjSize, + m_ * n_ * sizeof(DT), + hipMemcpyDeviceToDevice, + stream)); if (m->inference_debugging) { // save result to file for checking std::string filename = @@ -1446,6 +1463,11 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // Step 7: perform rotary position embeddings (RoPE) bwd { if (m->rotary_embedding_meta->apply_rotary_embedding) { + checkCUDA(hipMemcpyAsync(m->peft_token_infos_device, + m->peft_token_infos, + m->peft_token_infos_size, + hipMemcpyHostToDevice, + stream)); assert(m->hidden_size == m->qProjSize * m->num_q_heads); assert(m->qProjSize == m->kProjSize); /*q&k*/ @@ -1459,7 +1481,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, stream, A, m->complex_input, - m->token_infos, + m->peft_token_infos_device, m->rotary_embedding_meta->rope_theta, (m->rotary_embedding_meta->rope_type == "llama3"), m->rotary_embedding_meta->factor, @@ -1752,24 +1774,34 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_t complex_size = (max_tokens_per_batch * (qProjSize * num_q_heads + kProjSize * num_q_heads)) / 2; - allocated_peft_buffer_size1 = - enable_peft_finetuning - ? (BatchConfig::max_finetuning_sequence_length() * num_q_heads * - qProjSize * size_of_dt) - : 0; - allocated_peft_buffer_size2 = - enable_peft_finetuning - ? (BatchConfig::max_finetuning_sequence_length() * - BatchConfig::max_finetuning_sequence_length() * num_q_heads * - size_of_dt) - : 0; + if (enable_peft_finetuning) { + allocated_peft_buffer_size1 = BatchConfig::max_sequence_length() * + num_q_heads * qProjSize * size_of_dt; + allocated_peft_buffer_size2 = BatchConfig::max_sequence_length() * + BatchConfig::max_sequence_length() * + num_q_heads * size_of_dt; + peft_token_infos = (BatchConfig::PerTokenInfo *)calloc( + 1, + sizeof(BatchConfig::PerTokenInfo) * + BatchConfig::max_sequence_length()); + peft_token_infos_size = sizeof(BatchConfig::PerTokenInfo) * + BatchConfig::max_sequence_length(); + } else { + allocated_peft_buffer_size1 = 0; + allocated_peft_buffer_size2 = 0; + peft_token_infos = nullptr; + peft_token_infos_size = 0; + } size_t totalSize = (qkv_max_proj_size + key_cache_size + value_cache_size + 2 * qk_prod_size + attn_heads_size) * size_of_dt + complex_size * sizeof(hipFloatComplex); // more components will // be added here later - totalSize += allocated_peft_buffer_size1 + allocated_peft_buffer_size2; + if (enable_peft_finetuning) { + totalSize += allocated_peft_buffer_size1 + allocated_peft_buffer_size2; + totalSize += peft_token_infos_size; + } if (offload) { // assert that we have enough reserved work space left size_t totalSharedSize = @@ -1841,6 +1873,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( allocated_peft_buffer_size1); softmax_activation_buffer = gpu_mem_allocator.allocate_instance_untyped( allocated_peft_buffer_size2); + peft_token_infos_device = (BatchConfig::PerTokenInfo *) + gpu_mem_allocator.allocate_instance_untyped( + peft_token_infos_size); } // allocate more size for quantization data @@ -1852,8 +1887,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( gpu_mem_allocator.reserved_allocated_size); } } - allocated_peft_buffer_size1 = 0; - allocated_peft_buffer_size2 = 0; + 
checkCUDA(hipStreamSynchronize(stream)); } diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 35dd27d36..c2aebcac6 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -1927,10 +1927,13 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, } // Step 7: perform rotary position embeddings (RoPE) bwd - // todo: first sum the gradients wrt each q_head to obtain the gradients wrt - // each key head { if (m->rotary_embedding_meta->apply_rotary_embedding) { + checkCUDA(cudaMemcpyAsync(m->peft_token_infos_device, + m->peft_token_infos, + m->peft_token_infos_size, + cudaMemcpyHostToDevice, + stream)); assert(m->hidden_size == m->qProjSize * m->num_q_heads); assert(m->qProjSize == m->kProjSize); /*q&k*/ @@ -1942,7 +1945,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, stream>>>( A, m->complex_input, - m->token_infos, + m->peft_token_infos_device, m->rotary_embedding_meta->rope_theta, (m->rotary_embedding_meta->rope_type == "llama3"), m->rotary_embedding_meta->factor, @@ -2257,15 +2260,23 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( 2; if (enable_peft_finetuning) { allocated_peft_buffer_size1 = - BatchConfig::max_finetuning_sequence_length() * num_q_heads * + BatchConfig::max_sequence_length() * num_q_heads * qProjSize * size_of_dt; allocated_peft_buffer_size2 = - BatchConfig::max_finetuning_sequence_length() * - BatchConfig::max_finetuning_sequence_length() * num_q_heads * + BatchConfig::max_sequence_length() * + BatchConfig::max_sequence_length() * num_q_heads * size_of_dt; + peft_token_infos = (BatchConfig::PerTokenInfo *)calloc( + 1, + sizeof(BatchConfig::PerTokenInfo) * + BatchConfig::max_sequence_length()); + peft_token_infos_size = sizeof(BatchConfig::PerTokenInfo) * + BatchConfig::max_sequence_length(); } else { allocated_peft_buffer_size1 = 0; allocated_peft_buffer_size2 = 0; + peft_token_infos = nullptr; + peft_token_infos_size = 0; } size_t totalSize = (qkv_max_proj_size + query_tmp_size + key_cache_size + value_cache_size + 2 * qk_prod_size + attn_heads_size + @@ -2275,6 +2286,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( 3 * gqa_ptr_array_size; if (enable_peft_finetuning) { totalSize += allocated_peft_buffer_size1 + allocated_peft_buffer_size2; + totalSize += peft_token_infos_size; totalSize += 3 * gqa_ptr_array_size; } if (offload) { @@ -2382,6 +2394,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( allocated_peft_buffer_size1); softmax_activation_buffer = gpu_mem_allocator.allocate_instance_untyped( allocated_peft_buffer_size2); + peft_token_infos_device = (BatchConfig::PerTokenInfo *) + gpu_mem_allocator.allocate_instance_untyped( + peft_token_infos_size); } // allocate more size for quantization data diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 2111dff50..1d46dcf90 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -40,11 +40,24 @@ LinearMeta::LinearMeta(FFHandler handler, gpu_mem_allocator.allocate_reserved(quantized_weightSize); } } - // Allocate an all-one's vector + // peft activation + size_t out_dim = + li->outputs[0]->dims[0].size / li->outputs[0]->dims[0].degree; + allocated_peft_buffer_size = + enable_peft_finetuning ? 
(data_type_size(data_type) * + BatchConfig::max_sequence_length() * out_dim) + : 0; + size_t totalSize = + data_type_size(data_type) * batch_size + allocated_peft_buffer_size; gpu_mem_allocator.create_legion_instance( - reserveInst, data_type_size(data_type) * batch_size, "LinearMeta"); + reserveInst, totalSize, "LinearMeta"); + // Allocate an all-one's vector one_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * batch_size); + if (enable_peft_finetuning) { + output_activation_buffer = + gpu_mem_allocator.allocate_instance_untyped(allocated_peft_buffer_size); + } int parallelism = batch_size; hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -211,6 +224,11 @@ void inference_kernel_wrapper(LinearMeta *m, out_dim, batch_size, stream); + if ((m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) && + bc->num_finetuning_fwd_requests() > 0) { + Internal::store_peft_activations( + m, bc, out_dim, static_cast(output_ptr), stream); + } } else if (m->input_type[0] == DT_HALF) { Internal::forward_kernel(m, input_ptr, @@ -221,41 +239,10 @@ void inference_kernel_wrapper(LinearMeta *m, out_dim, batch_size, stream); - } - - if (m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) { - // save input activation if needed for PEFT - if (bc->num_finetuning_fwd_requests() > 0) { - // Check that we have at most one request that requires peft_bwd - assert(bc->num_finetuning_fwd_tokens() >= 1); - int i = bc->finetuning_request_index(); - assert(bc->requestsInfo[i].peft_model_id != PEFTModelID::NO_ID); - assert(!bc->requestsInfo[i].finetuning_backward_phase); - int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; - assert(m->allocated_peft_buffer_size == - data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim); - int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); - int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; - // copy input activation - if (m->input_type[0] == DT_FLOAT) { - checkCUDA(hipMemcpyAsync( - m->input_activation, - added_output.get_float_ptr() + first_token_offset * in_dim, - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, - hipMemcpyDeviceToDevice, - stream)); - } else if (m->input_type[0] == DT_HALF) { - checkCUDA(hipMemcpyAsync( - m->input_activation, - added_output.get_half_ptr() + first_token_offset * in_dim, - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, - hipMemcpyDeviceToDevice, - stream)); - } else { - assert(false && "unsupport datatype in layernorm"); - } + if ((m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) && + bc->num_finetuning_fwd_requests() > 0) { + Internal::store_peft_activations( + m, bc, out_dim, static_cast(output_ptr), stream); } } @@ -276,9 +263,7 @@ void peft_bwd_kernel_wrapper(LinearMeta const *m, void *output_grad_ptr, void const *weight_ptr, int in_dim, - int out_dim, - int num_infr_tokens, - int num_peft_tokens) { + int out_dim) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); hipEvent_t t_start, t_end; @@ -295,8 +280,6 @@ void peft_bwd_kernel_wrapper(LinearMeta const *m, weight_ptr, in_dim, out_dim, - num_infr_tokens, - num_peft_tokens, stream); } else if (m->input_type[0] == DT_HALF) { Internal::peft_bwd_kernel(m, @@ -306,8 +289,6 @@ void peft_bwd_kernel_wrapper(LinearMeta const *m, weight_ptr, in_dim, out_dim, - num_infr_tokens, - num_peft_tokens, stream); } @@ -561,6 +542,34 @@ void 
forward_kernel(LinearMeta const *m, } } +template +void store_peft_activations(LinearMeta const *m, + BatchConfig const *bc, + size_t out_dim, + DT *output_ptr, + hipStream_t stream) { + int i = bc->finetuning_request_index(); + int num_ft_tokens = bc->num_finetuning_fwd_tokens(); + int tokens_previous_requests = + bc->requestsInfo[i].first_token_offset_in_batch; + int tokens_previous_steps = bc->requestsInfo[i].first_token_offset_in_batch; + size_t data_size = out_dim * num_ft_tokens * sizeof(DT); + size_t batch_offset = out_dim * tokens_previous_requests; + size_t request_offset = out_dim * tokens_previous_steps; + assert(bc->num_finetuning_fwd_tokens() >= 1); + assert(bc->requestsInfo[i].peft_model_id != PEFTModelID::NO_ID); + assert(!bc->requestsInfo[i].finetuning_backward_phase); + assert(bc->requestsInfo[i].num_tokens_in_batch == num_ft_tokens); + assert(m->allocated_peft_buffer_size >= data_size); + + checkCUDA(hipMemcpyAsync(static_cast
<DT *>(m->output_activation_buffer) + + request_offset, + output_ptr + batch_offset, + data_size, + hipMemcpyDeviceToDevice, + stream)); +} + template <typename DT> void peft_bwd_kernel(LinearMeta const *m, BatchConfig const *bc, @@ -569,21 +578,22 @@ void peft_bwd_kernel(LinearMeta const *m, void const *kernel_ptr, int in_dim, int out_dim, - int num_infr_tokens, - int num_peft_tokens, ffStream_t stream) { checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + assert( + bc->peft_bwd_applies_to_this_layer(m->layer_guid.transformer_layer_id)); + int i = bc->finetuning_request_index(); + int num_peft_tokens = bc->num_finetuning_bwd_tokens(); + assert(bc->num_finetuning_bwd_requests() == 1); + hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); - // update input_grad_ptr and output_grad_ptr offset - int num_infr_only_tokens = num_infr_tokens - num_peft_tokens; - input_grad_ptr = - static_cast
<DT *>(input_grad_ptr) + num_infr_only_tokens * in_dim; - output_grad_ptr = - static_cast<DT *>
(output_grad_ptr) + num_infr_only_tokens * out_dim; + input_grad_ptr = static_cast<DT *>
(input_grad_ptr); + output_grad_ptr = static_cast<DT *>
(output_grad_ptr); + hipblasDatatype_t compute_type = output_type; int output_size = out_dim * num_peft_tokens; if (m->activation == AC_MODE_RELU) { @@ -607,31 +617,13 @@ void peft_bwd_kernel(LinearMeta const *m, // NOTE: we use beta=1 for input_grad to accumulate gradients when needed DT alpha = 1.0f; DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; - - // ensure that we only have one finetuning request, with a single lora - int num_peft_requests = 0; - bool lora_applies = false; - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || - bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || - !bc->requestsInfo[i].peft_bwd) { - continue; - } - num_peft_requests++; - std::string peft_model_config_str = - std::string(bc->requestsInfo[i].peft_model_config_str); - LoraLinearConfig lora_config = - LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); - if (!lora_applies_to_this_layer(m, lora_config)) { - continue; - } - lora_applies = true; - } - assert(num_peft_requests == 1 && - "Exactly one PEFT finetuning request is required"); + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + bool lora_applies = lora_applies_to_this_layer(m, lora_config); // if the request does not have any active lora in the current layer, reset - // beta to 0 std::cout << m->op_name << " original beta: " << (float)beta << " - // lora_applies: " << lora_applies << std::endl; + // beta to 0 if (lora_applies) { beta = 1.0f; } diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index 0df5ab723..b0d76a958 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -43,10 +43,9 @@ LinearMeta::LinearMeta(FFHandler handler, size_t out_dim = li->outputs[0]->dims[0].size / li->outputs[0]->dims[0].degree; allocated_peft_buffer_size = - enable_peft_finetuning - ? (data_type_size(data_type) * - BatchConfig::max_finetuning_sequence_length() * out_dim) - : 0; + enable_peft_finetuning ? 
(data_type_size(data_type) * + BatchConfig::max_sequence_length() * out_dim) + : 0; size_t totalSize = data_type_size(data_type) * batch_size + allocated_peft_buffer_size; gpu_mem_allocator.create_legion_instance( @@ -226,8 +225,10 @@ void inference_kernel_wrapper(LinearMeta *m, out_dim, batch_size, stream); - if ((m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) && bc->num_finetuning_fwd_requests() > 0) { - Internal::store_peft_activations(m, bc, out_dim, static_cast(output_ptr), stream); + if ((m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) && + bc->num_finetuning_fwd_requests() > 0) { + Internal::store_peft_activations( + m, bc, out_dim, static_cast(output_ptr), stream); } } else if (m->input_type[0] == DT_HALF) { Internal::forward_kernel(m, @@ -239,8 +240,10 @@ void inference_kernel_wrapper(LinearMeta *m, out_dim, batch_size, stream); - if ((m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) && bc->num_finetuning_fwd_requests() > 0) { - Internal::store_peft_activations(m, bc, out_dim, static_cast(output_ptr), stream); + if ((m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) && + bc->num_finetuning_fwd_requests() > 0) { + Internal::store_peft_activations( + m, bc, out_dim, static_cast(output_ptr), stream); } } @@ -537,7 +540,8 @@ void store_peft_activations(LinearMeta const *m, cudaStream_t stream) { int i = bc->finetuning_request_index(); int num_ft_tokens = bc->num_finetuning_fwd_tokens(); - int tokens_previous_requests = bc->requestsInfo[i].first_token_offset_in_batch; + int tokens_previous_requests = + bc->requestsInfo[i].first_token_offset_in_batch; int tokens_previous_steps = bc->requestsInfo[i].first_token_offset_in_batch; size_t data_size = out_dim * num_ft_tokens * sizeof(DT); size_t batch_offset = out_dim * tokens_previous_requests; @@ -547,8 +551,9 @@ void store_peft_activations(LinearMeta const *m, assert(!bc->requestsInfo[i].finetuning_backward_phase); assert(bc->requestsInfo[i].num_tokens_in_batch == num_ft_tokens); assert(m->allocated_peft_buffer_size >= data_size); - - checkCUDA(cudaMemcpyAsync(static_cast(m->output_activation_buffer) + request_offset, + + checkCUDA(cudaMemcpyAsync(static_cast
(m->output_activation_buffer) + + request_offset, output_ptr + batch_offset, data_size, cudaMemcpyDeviceToDevice, @@ -576,7 +581,6 @@ void peft_bwd_kernel(LinearMeta const *m, cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); - input_grad_ptr = static_cast
<DT *>(input_grad_ptr); output_grad_ptr = static_cast<DT *>
(output_grad_ptr); cudaDataType_t compute_type = output_type; diff --git a/src/ops/kernels/lora_linear_kernels.cpp b/src/ops/kernels/lora_linear_kernels.cpp index 2837ad044..cadde8233 100644 --- a/src/ops/kernels/lora_linear_kernels.cpp +++ b/src/ops/kernels/lora_linear_kernels.cpp @@ -194,7 +194,7 @@ void inference_kernel(LoraLinearMeta *m, bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { continue; } - if (bc->requestsInfo[i].peft_bwd) { + if (bc->requestsInfo[i].finetuning_request) { num_peft_requests++; } std::string peft_model_config_str = @@ -206,17 +206,18 @@ void inference_kernel(LoraLinearMeta *m, } // std::cout << "Lora layer activated!" << std::endl; // std::cout << "Lora Config: " << peft_model_config_str << std::endl; - assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && + assert(lora_config.trainable == bc->requestsInfo[i].finetuning_request && "Trainable flag mismatch"); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + // assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); // int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; LoraLinearWeight weight = m->peft_memory_manager->get_peft( bc->requestsInfo[i].peft_model_id, lora_config); - void *intermediate_result_ptr = (bc->requestsInfo[i].peft_bwd) + void *intermediate_result_ptr = (bc->requestsInfo[i].finetuning_request) ? weight.low_rank_activation : m->handle.workSpace; - if (bc->requestsInfo[i].peft_bwd) { + if (bc->requestsInfo[i].finetuning_request) { checkCUDA(hipMemcpyAsync(weight.input_activation, input_ptr + first_token_offset * in_dim, data_type_size(m->input_type[0]) * @@ -322,151 +323,50 @@ void peft_bwd_kernel(Context ctx, hipblasDatatype_t lr_actv_type = output_type; hipblasDatatype_t compute_type = output_type; - for (int i = 0; i < bc->max_requests_per_batch(); i++) { - // Skip completed, non-PEFT and PEFT forward-only requests - if (bc->request_completed[i] || - bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID || - !bc->requestsInfo[i].peft_bwd) { - continue; - } - std::string peft_model_config_str = - std::string(bc->requestsInfo[i].peft_model_config_str); - LoraLinearConfig lora_config = - LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); - if (!lora_applies_to_this_layer(m, lora_config)) { - continue; - } - // std::cout << "Lora layer activated!" << std::endl; - // std::cout << "Lora Config: " << peft_model_config_str << std::endl; - assert(lora_config.trainable == bc->requestsInfo[i].peft_bwd && - "Trainable flag mismatch"); - m->peft_memory_manager->check_ft_model_id( - bc->requestsInfo[i].peft_model_id); - int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - // int max_peft_tokens = bc->requestsInfo[i].max_length; - // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; - LoraLinearWeight weight = m->peft_memory_manager->get_peft( - bc->requestsInfo[i].peft_model_id, lora_config); - DT scaling_constant = (DT)(lora_config.lora_alpha / lora_config.rank); - - // Compute LORA_B weight's gradient - if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) { - DT alpha = 1.0f; - DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) - ? 
0.0f - : 1.0f; - // std::cout << "Lora B gradient computation, beta = " << (float) beta << - // std::endl; - if (m->inference_debugging) { - // save result to file for checking - std::string filename = - get_peft_dbg_folder(m, shard_id, false) + ".low_rank_activation"; - std::cout << "Save low_rank_activation (" << lora_config.rank << ", " - << num_peft_tokens << ") to " << filename << std::endl; - save_tensor(static_cast(weight.low_rank_activation), - lora_config.rank * num_peft_tokens, - filename.c_str()); - } - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - lora_config.rank, - out_dim, - num_peft_tokens, - &scaling_constant, - weight.low_rank_activation, - lr_actv_type, - lora_config.rank, - output_grad_ptr, - output_type, - out_dim, - &beta, - weight.w1_grad_ptr, - weight_type, - lora_config.rank, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - } - - // Compute LORA_B input's (and LORA_A output's) gradient inplace in - // low_rank_activation - { - DT alpha = 1.0f, beta = 0.0f; - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_N, - lora_config.rank, - num_peft_tokens, - out_dim, - &scaling_constant, - weight.w1_ptr, - weight_type, - lora_config.rank, - output_grad_ptr, - output_type, - out_dim, - &beta, - weight.low_rank_activation, - lr_actv_type, - lora_config.rank, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - } + assert( + bc->peft_bwd_applies_to_this_layer(m->layer_guid.transformer_layer_id)); + int i = bc->finetuning_request_index(); + std::string peft_model_config_str = + std::string(bc->requestsInfo[i].peft_model_config_str); + LoraLinearConfig lora_config = + LoraLinearConfig::deserialize_from_json_string(peft_model_config_str); + if (!lora_applies_to_this_layer(m, lora_config)) { + return; + } + // std::cout << "Lora layer activated!" << std::endl; + // std::cout << "Lora Config: " << peft_model_config_str << std::endl; + assert(lora_config.trainable == bc->requestsInfo[i].finetuning_request && + "Trainable flag mismatch"); + m->peft_memory_manager->check_ft_model_id(bc->requestsInfo[i].peft_model_id); + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + assert(num_peft_tokens == bc->num_finetuning_bwd_tokens()); + // int max_peft_tokens = bc->requestsInfo[i].max_length; + // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + LoraLinearWeight weight = m->peft_memory_manager->get_peft( + bc->requestsInfo[i].peft_model_id, lora_config); + DT scaling_constant = (DT)(lora_config.lora_alpha / lora_config.rank); - // Compute LORA_A weight's gradient - if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) { - DT alpha = 1.0f; - DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) - ? 0.0f - : 1.0f; - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - in_dim, - lora_config.rank, - num_peft_tokens, - &alpha, - weight.input_activation, - input_type, - in_dim, - weight.low_rank_activation, - lr_actv_type, - lora_config.rank, - &beta, - weight.w0_grad_ptr, - weight_type, - in_dim, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - } - // Compute input gradient - // NOTE: we use beta=1 for input_grad to accumulate gradients when needed - if (input_grad_ptr != nullptr) { - DT alpha = 1.0f; - DT beta = m->reset_input_grads[0] ? 
0.0f : 1.0f; - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_N, - in_dim, - num_peft_tokens, - lora_config.rank, - &alpha, - weight.w0_ptr, - weight_type, - in_dim, - weight.low_rank_activation, - lr_actv_type, - lora_config.rank, - &beta, - input_grad_ptr, - input_type, - in_dim, - compute_type, - HIPBLAS_GEMM_DEFAULT)); + // Compute LORA_B weight's gradient + if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) { + DT alpha = 1.0f; + DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) + ? 0.0f + : 1.0f; + // std::cout << "Lora B gradient computation, beta = " << (float) beta << + // std::endl; + if (m->inference_debugging) { + // save result to file for checking + std::string filename = + get_peft_dbg_folder(m, shard_id, false) + ".low_rank_activation"; + std::cout << "Save low_rank_activation (" << lora_config.rank << ", " + << num_peft_tokens << ") to " << filename << std::endl; + save_tensor(static_cast(weight.low_rank_activation), + lora_config.rank * num_peft_tokens, + filename.c_str()); } checkCUDA(hipblasGemmEx(m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, + HIPBLAS_OP_N, + HIPBLAS_OP_T, lora_config.rank, out_dim, num_peft_tokens, @@ -485,6 +385,83 @@ void peft_bwd_kernel(Context ctx, HIPBLAS_GEMM_DEFAULT)); } + // Compute LORA_B input's (and LORA_A output's) gradient inplace in + // low_rank_activation + { + DT alpha = 1.0f, beta = 0.0f; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + lora_config.rank, + num_peft_tokens, + out_dim, + &scaling_constant, + weight.w1_ptr, + weight_type, + lora_config.rank, + output_grad_ptr, + output_type, + out_dim, + &beta, + weight.low_rank_activation, + lr_actv_type, + lora_config.rank, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + + // Compute LORA_A weight's gradient + if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) { + DT alpha = 1.0f; + DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) + ? 0.0f + : 1.0f; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + in_dim, + lora_config.rank, + num_peft_tokens, + &alpha, + weight.input_activation, + input_type, + in_dim, + weight.low_rank_activation, + lr_actv_type, + lora_config.rank, + &beta, + weight.w0_grad_ptr, + weight_type, + in_dim, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + // Compute input gradient + // NOTE: we use beta=1 for input_grad to accumulate gradients when needed + if (input_grad_ptr != nullptr) { + DT alpha = 1.0f; + DT beta = m->reset_input_grads[0] ? 
0.0f : 1.0f; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + in_dim, + num_peft_tokens, + lora_config.rank, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + weight.low_rank_activation, + lr_actv_type, + lora_config.rank, + &beta, + input_grad_ptr, + input_type, + in_dim, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + if (bc->requestsInfo[i].optimizer_tasks.update_weights) { assert(lora_config.optimizer_config != nullptr); int w0_num_elements = lora_config.rank * in_dim; diff --git a/src/ops/kernels/residual_rms_norm_kernels.cpp b/src/ops/kernels/residual_rms_norm_kernels.cpp index 8e262a78c..d5a0c5c9d 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cpp +++ b/src/ops/kernels/residual_rms_norm_kernels.cpp @@ -39,14 +39,24 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, DataType data_type = rms->weights[0]->data_type; size_t rms_ptr_size = batch_size; size_t norm_ptr_size = num_elements; - size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type); + size_t in_dim = rms->inputs[0]->dims[0].size / rms->inputs[0]->dims[0].degree; + allocated_peft_buffer_size = + enable_peft_finetuning ? (data_type_size(data_type) * + BatchConfig::max_sequence_length() * in_dim) + : 0; + size_t totalSize = + (rms_ptr_size + norm_ptr_size) * data_type_size(data_type) + + allocated_peft_buffer_size; gpu_mem_allocator.create_legion_instance( reserveInst, totalSize, "ResidualRMSNormMeta"); rms_ptr = gpu_mem_allocator.allocate_instance_untyped( rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); - allocated_peft_buffer_size = 0; + if (enable_peft_finetuning) { + input_activation = + gpu_mem_allocator.allocate_instance_untyped(allocated_peft_buffer_size); + } } ResidualRMSNormMeta::~ResidualRMSNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { @@ -206,6 +216,35 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, } } +template +void store_peft_activations(ResidualRMSNormMeta const *m, + BatchConfig const *bc, + size_t in_dim, + DT *residual_output_ptr, + hipStream_t stream) { + assert(m->enable_peft_finetuning); + assert(bc->num_finetuning_fwd_tokens() >= 1); + + int num_ft_tokens = bc->num_finetuning_fwd_tokens(); + int i = bc->finetuning_request_index(); + int tokens_previous_requests = + bc->requestsInfo[i].first_token_offset_in_batch; + int tokens_previous_steps = bc->requestsInfo[i].first_token_offset_in_batch; + assert(bc->requestsInfo[i].num_tokens_in_batch == num_ft_tokens); + + size_t batch_offset = in_dim * tokens_previous_requests; + size_t request_offset = in_dim * tokens_previous_steps; + size_t data_size = in_dim * num_ft_tokens * sizeof(DT); + assert(m->allocated_peft_buffer_size >= data_size); + + checkCUDA( + hipMemcpyAsync(static_cast
(m->input_activation) + request_offset, + residual_output_ptr + batch_offset, + data_size, + hipMemcpyDeviceToDevice, + stream)); +} + void inference_kernel_wrapper(ResidualRMSNormMeta *m, BatchConfig const *bc, GenericTensorAccessorR const &input1, @@ -226,6 +265,7 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, assert(output.data_type == input1.data_type); assert(weight.data_type == output.data_type); assert(residual_output.data_type == output.data_type); + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; if (output.data_type == DT_HALF) { forward_kernel(m, @@ -235,6 +275,10 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, residual_output.get_half_ptr(), output.get_half_ptr(), stream); + if (bc->num_finetuning_fwd_requests() > 0) { + store_peft_activations( + m, bc, in_dim, residual_output.get_half_ptr(), stream); + } } else if (output.data_type == DT_FLOAT) { forward_kernel(m, input1.get_float_ptr(), @@ -243,45 +287,14 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, residual_output.get_float_ptr(), output.get_float_ptr(), stream); + if (bc->num_finetuning_fwd_requests() > 0) { + store_peft_activations( + m, bc, in_dim, residual_output.get_float_ptr(), stream); + } } else { assert(false && "Unsupported data type"); } - // save input activation if needed for PEFT. This must be done after the - // forward kernel since that's where we add the residual - if (bc->num_finetuning_fwd_requests() > 0) { - // Check that we have at most one request that requires peft_bwd - assert(bc->num_finetuning_fwd_tokens() >= 1); - int i = bc->finetuning_request_index(); - assert(bc->requestsInfo[i].peft_model_id != PEFTModelID::NO_ID); - assert(!bc->requestsInfo[i].finetuning_backward_phase); - int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; - assert(m->allocated_peft_buffer_size == - data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim); - int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); - int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; - // copy input activation - if (m->input_type[0] == DT_FLOAT) { - checkCUDA(hipMemcpyAsync( - m->input_activation, - added_output.get_float_ptr() + first_token_offset * in_dim, - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, - hipMemcpyDeviceToDevice, - stream)); - } else if (m->input_type[0] == DT_HALF) { - checkCUDA(hipMemcpyAsync( - m->input_activation, - added_output.get_half_ptr() + first_token_offset * in_dim, - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, - hipMemcpyDeviceToDevice, - stream)); - } else { - assert(false && "unsupport datatype in layernorm"); - } - } - if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index f952f0727..a576bad33 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -41,10 +41,9 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, size_t norm_ptr_size = num_elements; size_t in_dim = rms->inputs[0]->dims[0].size / rms->inputs[0]->dims[0].degree; allocated_peft_buffer_size = - enable_peft_finetuning - ? (data_type_size(data_type) * - BatchConfig::max_finetuning_sequence_length() * in_dim) - : 0; + enable_peft_finetuning ? 
(data_type_size(data_type) * + BatchConfig::max_sequence_length() * in_dim) + : 0; size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type) + allocated_peft_buffer_size; @@ -222,23 +221,25 @@ void store_peft_activations(ResidualRMSNormMeta const *m, cudaStream_t stream) { assert(m->enable_peft_finetuning); assert(bc->num_finetuning_fwd_tokens() >= 1); - + int num_ft_tokens = bc->num_finetuning_fwd_tokens(); int i = bc->finetuning_request_index(); - int tokens_previous_requests = bc->requestsInfo[i].first_token_offset_in_batch; + int tokens_previous_requests = + bc->requestsInfo[i].first_token_offset_in_batch; int tokens_previous_steps = bc->requestsInfo[i].first_token_offset_in_batch; assert(bc->requestsInfo[i].num_tokens_in_batch == num_ft_tokens); - + size_t batch_offset = in_dim * tokens_previous_requests; size_t request_offset = in_dim * tokens_previous_steps; size_t data_size = in_dim * num_ft_tokens * sizeof(DT); assert(m->allocated_peft_buffer_size >= data_size); - - checkCUDA(cudaMemcpyAsync(static_cast(m->input_activation) + request_offset, - residual_output_ptr + batch_offset, - data_size, - cudaMemcpyDeviceToDevice, - stream)); + + checkCUDA( + cudaMemcpyAsync(static_cast
(m->input_activation) + request_offset, + residual_output_ptr + batch_offset, + data_size, + cudaMemcpyDeviceToDevice, + stream)); } void inference_kernel_wrapper(ResidualRMSNormMeta *m, @@ -272,7 +273,8 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, output.get_half_ptr(), stream); if (bc->num_finetuning_fwd_requests() > 0) { - store_peft_activations(m, bc, in_dim, residual_output.get_half_ptr(), stream); + store_peft_activations( + m, bc, in_dim, residual_output.get_half_ptr(), stream); } } else if (output.data_type == DT_FLOAT) { forward_kernel(m, @@ -283,14 +285,13 @@ void inference_kernel_wrapper(ResidualRMSNormMeta *m, output.get_float_ptr(), stream); if (bc->num_finetuning_fwd_requests() > 0) { - store_peft_activations(m, bc, in_dim, residual_output.get_float_ptr(), stream); + store_peft_activations( + m, bc, in_dim, residual_output.get_float_ptr(), stream); } } else { assert(false && "Unsupported data type"); } - - if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); diff --git a/src/ops/kernels/rms_norm_kernels.cpp b/src/ops/kernels/rms_norm_kernels.cpp index 1bc33b26d..503613923 100644 --- a/src/ops/kernels/rms_norm_kernels.cpp +++ b/src/ops/kernels/rms_norm_kernels.cpp @@ -22,6 +22,7 @@ namespace FlexFlow { // declare Legion names using Legion::coord_t; + #define C10_WARP_SIZE 32 RMSNormMeta::RMSNormMeta(FFHandler handler, @@ -32,6 +33,7 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, in_dim = rms->data_dim; batch_size = rms->effective_batch_size; + enable_peft_finetuning = rms->enable_peft_finetuning; num_elements = in_dim * batch_size; DataType data_type = rms->weights[0]->data_type; @@ -39,10 +41,9 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, size_t norm_ptr_size = num_elements; size_t in_dim = rms->inputs[0]->dims[0].size / rms->inputs[0]->dims[0].degree; allocated_peft_buffer_size = - enable_peft_finetuning - ? (data_type_size(data_type) * - BatchConfig::max_finetuning_sequence_length() * in_dim) - : 0; + enable_peft_finetuning ? 
(data_type_size(data_type) * + BatchConfig::max_sequence_length() * in_dim) + : 0; size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type) + allocated_peft_buffer_size; @@ -222,25 +223,39 @@ void inference_kernel_wrapper(RMSNormMeta *m, assert(bc->requestsInfo[i].peft_model_id != PEFTModelID::NO_ID); assert(!bc->requestsInfo[i].finetuning_backward_phase); int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (m->allocated_peft_buffer_size != + data_type_size(m->input_type[0]) * BatchConfig::max_sequence_length() * + in_dim) { + std::cout << "allocated_peft_buffer_size = " + << m->allocated_peft_buffer_size << ", expected = " + << data_type_size(m->input_type[0]) * + BatchConfig::max_sequence_length() * in_dim + << std::endl; + std::cout << "in_dim = " << in_dim << std::endl; + std::cout << "max_sequence_length = " + << BatchConfig::max_sequence_length() << std::endl; + std::cout << "data_type_size = " << data_type_size(m->input_type[0]) + << std::endl; + } assert(m->allocated_peft_buffer_size == data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim); + BatchConfig::max_sequence_length() * in_dim); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; // copy input activation - if (m->input_type[0] == DT_FLOAT) { + if (input.data_type == DT_FLOAT) { checkCUDA(hipMemcpyAsync( m->input_activation, - added_output.get_float_ptr() + first_token_offset * in_dim, - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + input.get_float_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, hipMemcpyDeviceToDevice, stream)); - } else if (m->input_type[0] == DT_HALF) { + } else if (input.data_type == DT_HALF) { checkCUDA(hipMemcpyAsync( m->input_activation, - added_output.get_half_ptr() + first_token_offset * in_dim, - data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + input.get_half_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, hipMemcpyDeviceToDevice, stream)); } else { diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 4fba94945..928770616 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -41,10 +41,9 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, size_t norm_ptr_size = num_elements; size_t in_dim = rms->inputs[0]->dims[0].size / rms->inputs[0]->dims[0].degree; allocated_peft_buffer_size = - enable_peft_finetuning - ? (data_type_size(data_type) * - BatchConfig::max_finetuning_sequence_length() * in_dim) - : 0; + enable_peft_finetuning ? 
(data_type_size(data_type) * + BatchConfig::max_sequence_length() * in_dim) + : 0; size_t totalSize = (rms_ptr_size + norm_ptr_size) * data_type_size(data_type) + allocated_peft_buffer_size; @@ -222,22 +221,22 @@ void inference_kernel_wrapper(RMSNormMeta *m, assert(!bc->requestsInfo[i].finetuning_backward_phase); int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (m->allocated_peft_buffer_size != - data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim) { + data_type_size(m->input_type[0]) * BatchConfig::max_sequence_length() * + in_dim) { std::cout << "allocated_peft_buffer_size = " << m->allocated_peft_buffer_size << ", expected = " << data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim + BatchConfig::max_sequence_length() * in_dim << std::endl; std::cout << "in_dim = " << in_dim << std::endl; std::cout << "max_sequence_length = " - << BatchConfig::max_finetuning_sequence_length() << std::endl; + << BatchConfig::max_sequence_length() << std::endl; std::cout << "data_type_size = " << data_type_size(m->input_type[0]) << std::endl; } assert(m->allocated_peft_buffer_size == data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim); + BatchConfig::max_sequence_length() * in_dim); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index 574547858..bf0d58a05 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -24,7 +24,9 @@ using Legion::Domain; SoftmaxMeta::SoftmaxMeta(FFHandler handler, Softmax const *softmax, - Domain const &input_domain) + Domain const &input_domain, + bool is_last_op, + MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, softmax) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( @@ -36,6 +38,17 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, profiling = softmax->profiling; inference_debugging = softmax->inference_debugging; enable_peft_finetuning = softmax->enable_peft_finetuning; + if (enable_peft_finetuning && is_last_op) { + allocated_peft_buffer_size = + input_domain.get_volume() * data_type_size(softmax->data_type); + gpu_mem_allocator.create_legion_instance( + reserveInst, allocated_peft_buffer_size, "SoftmaxMeta"); + output_grad_ptr = + gpu_mem_allocator.allocate_instance_untyped(allocated_peft_buffer_size); + } else { + allocated_peft_buffer_size = 0; + output_grad_ptr = nullptr; + } std::strcpy(op_name, softmax->name); } @@ -120,12 +133,11 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, } } -void inference_kernel_wrapper(SoftmaxMeta const *m, +void inference_kernel_wrapper(SoftmaxMeta *m, BatchConfig const *bc, bool is_last_op, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - GenericTensorAccessorW const &output_grad) { + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); hipEvent_t t_start, t_end; @@ -142,12 +154,9 @@ void inference_kernel_wrapper(SoftmaxMeta const *m, output.get_float_ptr(), num_classes, stream); - if (is_last_op) { - checkCUDA(hipMemcpyAsync(output_grad.get_float_ptr(), - output.get_float_ptr(), - output.domain.get_volume() * sizeof(float), - hipMemcpyDeviceToDevice, - stream)); + if (is_last_op && bc->num_finetuning_fwd_requests() > 0) 
{ + Internal::store_peft_activations( + m, bc, num_classes, output.get_float_ptr(), stream); } } else if (m->output_type[0] == DT_HALF) { Internal::inference_kernel(m, @@ -156,12 +165,9 @@ void inference_kernel_wrapper(SoftmaxMeta const *m, output.get_half_ptr(), num_classes, stream); - if (is_last_op) { - checkCUDA(hipMemcpyAsync(output_grad.get_half_ptr(), - output.get_half_ptr(), - output.domain.get_volume() * sizeof(half), - hipMemcpyDeviceToDevice, - stream)); + if (is_last_op && bc->num_finetuning_fwd_requests() > 0) { + Internal::store_peft_activations( + m, bc, num_classes, output.get_half_ptr(), stream); } } else { assert(false && "Unsupported data type"); @@ -183,8 +189,7 @@ void inference_kernel_wrapper(SoftmaxMeta const *m, void peft_bwd_kernel_wrapper(SoftmaxMeta const *m, BatchConfig const *bc, - GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorW const &input_grad) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); hipEvent_t t_start, t_end; @@ -194,21 +199,13 @@ void peft_bwd_kernel_wrapper(SoftmaxMeta const *m, checkCUDA(hipEventRecord(t_start, stream)); } - int num_classes = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + int num_classes = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; if (m->output_type[0] == DT_FLOAT) { - Internal::peft_bwd_kernel(m, - bc, - input_grad.get_float_ptr(), - output_grad.get_float_ptr(), - num_classes, - stream); + Internal::peft_bwd_kernel( + m, bc, input_grad.get_float_ptr(), num_classes, stream); } else if (m->output_type[0] == DT_HALF) { - Internal::peft_bwd_kernel(m, - bc, - input_grad.get_half_ptr(), - output_grad.get_half_ptr(), - num_classes, - stream); + Internal::peft_bwd_kernel( + m, bc, input_grad.get_half_ptr(), num_classes, stream); } else { assert(false && "Unsupported data type"); } @@ -288,6 +285,39 @@ void inference_kernel(SoftmaxMeta const *m, MIOPEN_SOFTMAX_MODE_CHANNEL)); } +template +void store_peft_activations(SoftmaxMeta *m, + BatchConfig const *bc, + int num_classes, + DT *output_ptr, + hipStream_t stream) { + assert(m->enable_peft_finetuning); + assert(m->output_grad_ptr != nullptr); + + int num_ft_tokens = bc->num_finetuning_fwd_tokens(); + int i = bc->finetuning_request_index(); + int tokens_previous_requests = + bc->requestsInfo[i].first_token_offset_in_batch; + int prev_steps_tokens = bc->requestsInfo[i].first_token_depth_in_request; + assert(bc->requestsInfo[i].num_tokens_in_batch == num_ft_tokens); + + // shift labels by 1 position to the left (ignore first token label) + for (int j = 0; j < num_ft_tokens - 1; j++) { + m->peft_token_ids[prev_steps_tokens + j] = + bc->tokensInfo[tokens_previous_requests + j + 1].token_id; + } + + size_t batch_offset = num_classes * tokens_previous_requests; + size_t req_offset = num_classes * prev_steps_tokens; + size_t data_size = num_classes * num_ft_tokens * sizeof(DT); + assert(m->allocated_peft_buffer_size >= data_size); + checkCUDA(hipMemcpyAsync(static_cast
<DT *>
(m->output_grad_ptr) + req_offset, + output_ptr + batch_offset, + data_size, + hipMemcpyDeviceToDevice, + stream)); +} + template __global__ void sparse_categorical_crossentropy_loss_peft_backward( DT *input_grad, @@ -309,32 +339,22 @@ template void peft_bwd_kernel(SoftmaxMeta const *m, BatchConfig const *bc, DT *input_grad_ptr, - DT const *output_grad_ptr, int num_classes, hipStream_t stream) { - BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS]; assert( bc->peft_bwd_applies_to_this_layer(m->layer_guid.transformer_layer_id)); int i = bc->finetuning_request_index(); - int tokens_previous_requests = - bc->requestsInfo[i].first_token_offset_in_batch; + int num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; - // shift labels by 1 position to the left (ignore first token label) - for (int j = 0; j < num_bwd_tokens; j++) { - token_ids[j] = bc->tokensInfo[j + tokens_previous_requests + 1].token_id; - } - DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch - 1); + DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch); // ignore last token - checkCUDA(hipMemsetAsync(input_grad_ptr + - (tokens_previous_requests + - bc->requestsInfo[i].num_tokens_in_batch - 1) * - num_classes, + checkCUDA(hipMemsetAsync(input_grad_ptr + num_bwd_tokens * num_classes, 0, num_classes * sizeof(DT), stream)); checkCUDA(hipMemcpyAsync(m->handle.workSpace, - token_ids, + m->peft_token_ids, sizeof(BatchConfig::TokenId) * num_bwd_tokens, hipMemcpyHostToDevice, stream)); @@ -344,8 +364,8 @@ void peft_bwd_kernel(SoftmaxMeta const *m, CUDA_NUM_THREADS, 0, stream, - input_grad_ptr + tokens_previous_requests * num_classes, - output_grad_ptr + tokens_previous_requests * num_classes, + input_grad_ptr, + static_cast
<DT *>
(m->output_grad_ptr), static_cast(m->handle.workSpace), num_bwd_tokens, num_classes); @@ -355,7 +375,7 @@ void peft_bwd_kernel(SoftmaxMeta const *m, CUDA_NUM_THREADS, 0, stream, - input_grad_ptr + tokens_previous_requests * num_classes, + input_grad_ptr, num_bwd_tokens * num_classes, DT(0.0), scale_factor); diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index 540e15c08..3a7864baf 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -154,7 +154,8 @@ void inference_kernel_wrapper(SoftmaxMeta *m, num_classes, stream); if (is_last_op && bc->num_finetuning_fwd_requests() > 0) { - Internal::store_peft_activations(m, bc, num_classes, output.get_float_ptr(), stream); + Internal::store_peft_activations( + m, bc, num_classes, output.get_float_ptr(), stream); } } else if (m->output_type[0] == DT_HALF) { Internal::inference_kernel(m, @@ -164,7 +165,8 @@ void inference_kernel_wrapper(SoftmaxMeta *m, num_classes, stream); if (is_last_op && bc->num_finetuning_fwd_requests() > 0) { - Internal::store_peft_activations(m, bc, num_classes, output.get_half_ptr(), stream); + Internal::store_peft_activations( + m, bc, num_classes, output.get_half_ptr(), stream); } } else { assert(false && "Unsupported data type"); @@ -285,29 +287,31 @@ void inference_kernel(SoftmaxMeta const *m, template void store_peft_activations(SoftmaxMeta *m, - BatchConfig const *bc, - int num_classes, - DT *output_ptr, - cudaStream_t stream) { + BatchConfig const *bc, + int num_classes, + DT *output_ptr, + cudaStream_t stream) { assert(m->enable_peft_finetuning); assert(m->output_grad_ptr != nullptr); - + int num_ft_tokens = bc->num_finetuning_fwd_tokens(); int i = bc->finetuning_request_index(); - int tokens_previous_requests = bc->requestsInfo[i].first_token_offset_in_batch; + int tokens_previous_requests = + bc->requestsInfo[i].first_token_offset_in_batch; int prev_steps_tokens = bc->requestsInfo[i].first_token_depth_in_request; assert(bc->requestsInfo[i].num_tokens_in_batch == num_ft_tokens); // shift labels by 1 position to the left (ignore first token label) - for (int j = 0; j < num_ft_tokens-1; j++) { - m->peft_token_ids[j] = bc->tokensInfo[tokens_previous_requests + prev_steps_tokens + j + 1].token_id; + for (int j = 0; j < num_ft_tokens - 1; j++) { + m->peft_token_ids[prev_steps_tokens + j] = + bc->tokensInfo[tokens_previous_requests + j + 1].token_id; } size_t batch_offset = num_classes * tokens_previous_requests; size_t req_offset = num_classes * prev_steps_tokens; size_t data_size = num_classes * num_ft_tokens * sizeof(DT); assert(m->allocated_peft_buffer_size >= data_size); - checkCUDA(cudaMemcpyAsync(static_cast(m->output_grad_ptr) + req_offset, + checkCUDA(cudaMemcpyAsync(static_cast
<DT *>
(m->output_grad_ptr) + req_offset, output_ptr + batch_offset, data_size, cudaMemcpyDeviceToDevice, @@ -343,9 +347,9 @@ void peft_bwd_kernel(SoftmaxMeta const *m, int num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; - DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch ); + DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch); // ignore last token - checkCUDA(cudaMemsetAsync(input_grad_ptr + num_bwd_tokens * num_classes, + checkCUDA(cudaMemsetAsync(input_grad_ptr + num_bwd_tokens * num_classes, 0, num_classes * sizeof(DT), stream)); @@ -367,10 +371,8 @@ void peft_bwd_kernel(SoftmaxMeta const *m, scale_kernel<<>>(input_grad_ptr, - num_bwd_tokens * num_classes, - DT(0.0), - scale_factor); + stream>>>( + input_grad_ptr, num_bwd_tokens * num_classes, DT(0.0), scale_factor); } } // namespace Internal diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index 55569ea06..207fd382d 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -40,10 +40,9 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, DataType data_type = ln->data_type; size_t in_dim = ln->inputs[0]->dims[0].size / ln->inputs[0]->dims[0].degree; allocated_peft_buffer_size = - enable_peft_finetuning - ? (data_type_size(data_type) * - BatchConfig::max_finetuning_sequence_length() * in_dim) - : 0; + enable_peft_finetuning ? (data_type_size(data_type) * + BatchConfig::max_sequence_length() * in_dim) + : 0; size_t totalSize = effective_batch_size * data_type_size(data_type) * 6 + allocated_peft_buffer_size; gpu_mem_allocator.create_legion_instance( @@ -252,7 +251,7 @@ void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; assert(m->allocated_peft_buffer_size == data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim); + BatchConfig::max_sequence_length() * in_dim); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; @@ -260,14 +259,14 @@ void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, if (m->input_type[0] == DT_FLOAT) { checkCUDA(hipMemcpyAsync( m->input_activation, - added_output.get_float_ptr() + first_token_offset * in_dim, + input.get_float_ptr() + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, hipMemcpyDeviceToDevice, stream)); } else if (m->input_type[0] == DT_HALF) { checkCUDA(hipMemcpyAsync( m->input_activation, - added_output.get_half_ptr() + first_token_offset * in_dim, + input.get_half_ptr() + first_token_offset * in_dim, data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, hipMemcpyDeviceToDevice, stream)); diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 60c37ea7a..fecaa067c 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -39,10 +39,9 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, DataType data_type = ln->data_type; size_t in_dim = ln->inputs[0]->dims[0].size / ln->inputs[0]->dims[0].degree; allocated_peft_buffer_size = - enable_peft_finetuning - ? (data_type_size(data_type) * - BatchConfig::max_finetuning_sequence_length() * in_dim) - : 0; + enable_peft_finetuning ? 
(data_type_size(data_type) * + BatchConfig::max_sequence_length() * in_dim) + : 0; size_t totalSize = effective_batch_size * data_type_size(data_type) * 6 + allocated_peft_buffer_size; gpu_mem_allocator.create_legion_instance( @@ -251,7 +250,7 @@ void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; assert(m->allocated_peft_buffer_size == data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim); + BatchConfig::max_sequence_length() * in_dim); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 99e4f100f..7afb7cff8 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -737,8 +737,6 @@ bool Linear::peft_bwd_task(Task const *task, ctx, task->regions[0].region.get_index_space()); LinearMeta *m = *((LinearMeta **)task->local_args); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - // std::string op_name_without_uid = get_op_name_without_uid(m); - // std::cout << "Linear PEFT BWD " << op_name_without_uid << std::endl; if (!bc->peft_bwd_applies_to_this_layer(m->layer_guid.transformer_layer_id)) { return false; } @@ -772,13 +770,8 @@ bool Linear::peft_bwd_task(Task const *task, in_dim, num_peft_tokens); } - peft_bwd_kernel_wrapper(m, - bc, - input_grad.ptr, - output_grad.ptr, - weight.ptr, - in_dim, - out_dim); + peft_bwd_kernel_wrapper( + m, bc, input_grad.ptr, output_grad.ptr, weight.ptr, in_dim, out_dim); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/residual_layer_norm.cpp b/src/ops/residual_layer_norm.cpp index 606045daf..001b3f88b 100644 --- a/src/ops/residual_layer_norm.cpp +++ b/src/ops/residual_layer_norm.cpp @@ -42,10 +42,9 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, DataType data_type = ln->data_type; size_t in_dim = ln->inputs[0]->dims[0].size / ln->inputs[0]->dims[0].degree; allocated_peft_buffer_size = - enable_peft_finetuning - ? (data_type_size(data_type) * - BatchConfig::max_finetuning_sequence_length() * in_dim) - : 0; + enable_peft_finetuning ? (data_type_size(data_type) * + BatchConfig::max_sequence_length() * in_dim) + : 0; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3 + allocated_peft_buffer_size; gpu_mem_allocator.create_legion_instance( @@ -279,7 +278,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; assert(m->allocated_peft_buffer_size == data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim); + BatchConfig::max_sequence_length() * in_dim); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index ff61f6f79..64edcb853 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -41,10 +41,9 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, DataType data_type = ln->data_type; size_t in_dim = ln->inputs[0]->dims[0].size / ln->inputs[0]->dims[0].degree; allocated_peft_buffer_size = - enable_peft_finetuning - ? 
(data_type_size(data_type) * - BatchConfig::max_finetuning_sequence_length() * in_dim) - : 0; + enable_peft_finetuning ? (data_type_size(data_type) * + BatchConfig::max_sequence_length() * in_dim) + : 0; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3 + allocated_peft_buffer_size; gpu_mem_allocator.create_legion_instance( @@ -277,7 +276,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; assert(m->allocated_peft_buffer_size == data_type_size(m->input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * in_dim); + BatchConfig::max_sequence_length() * in_dim); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index 2e3f92a95..77c673f07 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -28,10 +28,10 @@ SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, inference_debugging = ssm->inference_debugging; enable_peft_finetuning = ssm->enable_peft_finetuning; if (enable_peft_finetuning) { - size_t in_dim = ln->inputs[0]->dims[0].size / ln->inputs[0]->dims[0].degree; - allocated_peft_buffer_size = 2 * data_type_size(data_type) * - BatchConfig::max_finetuning_sequence_length() * - in_dim; + size_t in_dim = + ssm->inputs[0]->dims[0].size / ssm->inputs[0]->dims[0].degree; + allocated_peft_buffer_size = 2 * data_type_size(input_type[0]) * + BatchConfig::max_sequence_length() * in_dim; gpu_mem_allocator.create_legion_instance( reserveInst, allocated_peft_buffer_size, "SigmoidSiluMultiMeta"); input_activation = @@ -119,13 +119,15 @@ void SigmoidSiluMulti::inference_kernel_wrapper( int i = bc->finetuning_request_index(); assert(bc->requestsInfo[i].peft_model_id != PEFTModelID::NO_ID); assert(!bc->requestsInfo[i].finetuning_backward_phase); - int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); + int max_peft_tokens = BatchConfig::max_sequence_length(); int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; size_t input_tensor_size = data_type_size(m->input_type[0]) * num_peft_tokens * in_dim; - assert(m->allocated_peft_buffer_size == 2 * input_tensor_size); + assert(m->allocated_peft_buffer_size == + 2 * (data_type_size(m->input_type[0]) * max_peft_tokens * in_dim)); // copy input activation if (m->input_type[0] == DT_FLOAT) { checkCUDA( diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index 14df260ea..22c4d79cc 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -30,8 +30,7 @@ SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, size_t in_dim = ssm->inputs[0]->dims[0].size / ssm->inputs[0]->dims[0].degree; allocated_peft_buffer_size = 2 * data_type_size(input_type[0]) * - BatchConfig::max_finetuning_sequence_length() * - in_dim; + BatchConfig::max_sequence_length() * in_dim; gpu_mem_allocator.create_legion_instance( reserveInst, allocated_peft_buffer_size, "SigmoidSiluMultiMeta"); input_activation = @@ -122,7 +121,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; int num_peft_tokens = 
bc->requestsInfo[i].num_tokens_in_batch; assert(num_peft_tokens == bc->num_finetuning_fwd_tokens()); - int max_peft_tokens = BatchConfig::max_finetuning_sequence_length(); + int max_peft_tokens = BatchConfig::max_sequence_length(); int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; size_t input_tensor_size = data_type_size(m->input_type[0]) * num_peft_tokens * in_dim; diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index fc8a0a96b..808cbf720 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -662,4 +662,4 @@ size_t hash::operator()( hash_combine(key, params.dim); return key; } -}; // namespace std +}; // namespace std \ No newline at end of file diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index da52ccea7..851fbdcca 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -23,12 +23,11 @@ namespace FlexFlow { +#define WARP_SIZE 32 + // declare Legion names using Legion::coord_t; using Legion::Memory; - -#define WARP_SIZE 32 - using namespace Kernels::IncMultiHeadAttention; namespace Kernels { @@ -576,6 +575,7 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, compute_type, HIPBLAS_GEMM_DEFAULT)); + // add alibi position bias to qk production if (*m->position_bias) { size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd
<DT>
), @@ -710,12 +710,12 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, size_t qkv_proj_size = m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); - hipMemcpyAsync(m->devQKVProjArray, - qkv_ptr, - qkv_proj_size * - sizeof(DT), // is this right, do we need layers etc here - hipMemcpyDeviceToDevice, - stream); + checkCUDA(hipMemcpyAsync( + m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * sizeof(DT), // is this right, do we need layers etc here + hipMemcpyDeviceToDevice, + stream)); // phase 1: Implement kernel to compute KQV for input tokens // TODO WARNING: this is commented out only because we are fixing the inc_attn // first @@ -735,11 +735,11 @@ void inference_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, int num_tokens = bc->num_active_tokens(); - hipMemcpyAsync(output_ptr, - m->attn_heads, - m->oProjSize * num_tokens * sizeof(DT), - hipMemcpyDeviceToDevice, - stream); + checkCUDA(hipMemcpyAsync(output_ptr, + m->attn_heads, + m->oProjSize * num_tokens * sizeof(DT), + hipMemcpyDeviceToDevice, + stream)); } } // namespace SpecIncMultiHeadSelfAttention diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 6e24326dd..2bfa88bdc 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -622,12 +622,12 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, size_t qkv_proj_size = m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM * bc->num_active_tokens(); - hipMemcpyAsync(m->devQKVProjArray, - qkv_ptr, - qkv_proj_size * - sizeof(DT), // is this right, do we need layers etc here - hipMemcpyDeviceToDevice, - stream); + checkCUDA(hipMemcpyAsync( + m->devQKVProjArray, + qkv_ptr, + qkv_proj_size * sizeof(DT), // is this right, do we need layers etc here + hipMemcpyDeviceToDevice, + stream)); // phase 1: Implement kernel to compute KQV for input tokens // TODO WARNING: this is commented out only because we are fixing the inc_attn diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 8d3606455..e7c72d93a 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -45,7 +45,8 @@ using namespace FlexFlow::Kernels::AllReduce; /* Params */ bool operator==(AllReduceParams const &lhs, AllReduceParams const &rhs) { - return lhs.allreduce_legion_dim == rhs.allreduce_legion_dim && + return lhs.layer_guid == rhs.layer_guid && + lhs.allreduce_legion_dim == rhs.allreduce_legion_dim && std::strcmp(lhs.name, rhs.name) == 0; } @@ -55,6 +56,7 @@ bool AllReduceParams::is_valid(ParallelTensorShape const &input) const { AllReduceParams AllReduce::get_params() const { AllReduceParams params; + params.layer_guid = this->layer_guid; params.allreduce_legion_dim = this->allreduce_dim; if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); @@ -63,10 +65,11 @@ AllReduceParams AllReduce::get_params() const { } AllReduce::AllReduce(FFModel &model, + LayerID const &_layer_guid, const ParallelTensor _input, int _allreduce_legion_dim, char const *name) - : ParallelOp(model, OP_ALLREDUCE, name, _input), + : ParallelOp(model, OP_ALLREDUCE, name, _input), layer_guid(_layer_guid), allreduce_dim(_allreduce_legion_dim) { int numdim = _input->num_dims; ParallelDim dims[MAX_TENSOR_DIM]; @@ -83,7 +86,11 @@ AllReduce::AllReduce(FFModel &model, AllReduceParams const ¶ms, ParallelTensor const input, char const *name) - : AllReduce(model, input, params.allreduce_legion_dim, params.name) {} + : AllReduce(model, + 
params.layer_guid, + input, + params.allreduce_legion_dim, + params.name) {} void AllReduce::create_input_partition(FFModel &ff) { // Do nothing @@ -112,6 +119,7 @@ OpMeta *AllReduce::init_task(Task const *task, meta->output_type[0] = ar->outputs[0]->data_type; assert(meta->input_type[0] == meta->output_type[0]); std::strcpy(meta->op_name, ar->name); + meta->layer_guid = ar->layer_guid; return meta; } diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu index 3041f9adf..6201962d9 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cu +++ b/src/parallel_ops/kernels/allreduce_kernels.cu @@ -86,7 +86,7 @@ void peft_bwd_kernel_wrapper(AllReduceMeta const *m, assert(input_grad.domain == output_grad.domain); size_t hidden_dim_size = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; - size_t num_elements = bc->num_active_tokens(); + size_t num_elements = bc->num_finetuning_bwd_tokens(); size_t data_size = data_type_size(output_grad.data_type); checkCUDA(cudaMemcpyAsync(input_grad.ptr, output_grad.ptr, diff --git a/src/parallel_ops/kernels/parallel_identity_kernels.cu b/src/parallel_ops/kernels/parallel_identity_kernels.cu index 6800f3ab1..2099347fa 100644 --- a/src/parallel_ops/kernels/parallel_identity_kernels.cu +++ b/src/parallel_ops/kernels/parallel_identity_kernels.cu @@ -76,7 +76,7 @@ void peft_bwd_kernel_wrapper(ParallelIdentityMeta const *m, assert(input_grad.domain == output_grad.domain); size_t hidden_dim_size = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; - size_t num_elements = bc->num_active_tokens() * hidden_dim_size; + size_t num_elements = bc->num_finetuning_bwd_tokens() * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input_grad.data_type); checkNCCL(ncclAllReduce(output_grad.ptr, diff --git a/src/parallel_ops/parallel_identity.cc b/src/parallel_ops/parallel_identity.cc index db0265726..723856c81 100644 --- a/src/parallel_ops/parallel_identity.cc +++ b/src/parallel_ops/parallel_identity.cc @@ -46,7 +46,8 @@ using namespace FlexFlow::Kernels::ParallelIdentity; /* Params */ bool operator==(ParallelIdentityParams const &lhs, ParallelIdentityParams const &rhs) { - return lhs.parallel_identity_legion_dim == rhs.parallel_identity_legion_dim && + return lhs.layer_guid == rhs.layer_guid && + lhs.parallel_identity_legion_dim == rhs.parallel_identity_legion_dim && std::strcmp(lhs.name, rhs.name) == 0; } @@ -56,6 +57,7 @@ bool ParallelIdentityParams::is_valid(ParallelTensorShape const &input) const { ParallelIdentityParams ParallelIdentity::get_params() const { ParallelIdentityParams params; + params.layer_guid = this->layer_guid; params.parallel_identity_legion_dim = this->parallel_identity_dim; if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); @@ -64,10 +66,12 @@ ParallelIdentityParams ParallelIdentity::get_params() const { } ParallelIdentity::ParallelIdentity(FFModel &model, + LayerID const &_layer_guid, const ParallelTensor _input, int _parallel_identity_legion_dim, char const *name) : ParallelOp(model, OP_PARALLEL_IDENTITY, name, _input), + layer_guid(_layer_guid), parallel_identity_dim(_parallel_identity_legion_dim) { int numdim = _input->num_dims; ParallelDim dims[MAX_TENSOR_DIM]; @@ -84,8 +88,11 @@ ParallelIdentity::ParallelIdentity(FFModel &model, ParallelIdentityParams const ¶ms, ParallelTensor const input, char const *name) - : ParallelIdentity( - model, input, params.parallel_identity_legion_dim, params.name) {} + : 
ParallelIdentity(model, + params.layer_guid, + input, + params.parallel_identity_legion_dim, + params.name) {} void ParallelIdentity::create_input_partition(FFModel &ff) { // Do nothing @@ -114,6 +121,7 @@ OpMeta *ParallelIdentity::init_task(Task const *task, meta->output_type[0] = ar->outputs[0]->data_type; assert(meta->input_type[0] == meta->output_type[0]); std::strcpy(meta->op_name, ar->name); + meta->layer_guid = ar->layer_guid; return meta; } diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index f8c4982c9..20fed4955 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -304,4 +304,4 @@ void BatchConfig::save_to_file(std::string const &filename) const { } } -}; // namespace FlexFlow +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 2bc64c167..1f086cc1a 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2453,6 +2453,9 @@ GraphOptimalViewSerialized } case OP_ALLREDUCE: { AllReduce *allreduce = (AllReduce *)op; + sez.serialize(allreduce->layer_guid.id); + sez.serialize(allreduce->layer_guid.transformer_layer_id); + sez.serialize(allreduce->layer_guid.model_id); sez.serialize(allreduce->allreduce_dim); sez.serialize(strlen(allreduce->name)); sez.serialize(allreduce->name, strlen(allreduce->name)); @@ -2460,6 +2463,9 @@ GraphOptimalViewSerialized } case OP_PARALLEL_IDENTITY: { ParallelIdentity *parallel_identity = (ParallelIdentity *)op; + sez.serialize(parallel_identity->layer_guid.id); + sez.serialize(parallel_identity->layer_guid.transformer_layer_id); + sez.serialize(parallel_identity->layer_guid.model_id); sez.serialize(parallel_identity->parallel_identity_dim); sez.serialize(strlen(parallel_identity->name)); sez.serialize(parallel_identity->name, strlen(parallel_identity->name)); @@ -3154,6 +3160,11 @@ void FFModel::deserialize_graph_optimal_view( break; } case OP_ALLREDUCE: { + size_t id, transformer_layer_id, deserialized_model_id; + dez.deserialize(id); + dez.deserialize(transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); assert(num_inputs == 1); int allreduce_dim; dez.deserialize(allreduce_dim); @@ -3162,12 +3173,18 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(name_len); dez.deserialize(name, name_len); AllReduceParams params; + params.layer_guid = layer_guid; params.allreduce_legion_dim = allreduce_dim; strcpy(params.name, name); node = get_or_create_node(inputs[0], params); break; } case OP_PARALLEL_IDENTITY: { + size_t id, transformer_layer_id, deserialized_model_id; + dez.deserialize(id); + dez.deserialize(transformer_layer_id); + dez.deserialize(deserialized_model_id); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); assert(num_inputs == 1); int parallel_identity_dim; dez.deserialize(parallel_identity_dim); @@ -3176,6 +3193,7 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(name_len); dez.deserialize(name, name_len); ParallelIdentityParams params; + params.layer_guid = layer_guid; params.parallel_identity_legion_dim = parallel_identity_dim; strcpy(params.name, name); node = get_or_create_node(inputs[0], params); diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 5b1f2d19a..c3256374f 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3513,7 +3513,11 @@ void FFModel::create_operators_from_layers() { std::to_string( transformer_layer_allreduce_count[transformer_layer_id])); 
transformer_layer_allreduce_count[transformer_layer_id]++; + LayerID ar_guid = LayerID(this->layer_global_guid++, + op->layer_guid.transformer_layer_id, + this->model_id); AllReduce *allreduce = new AllReduce(*this, + ar_guid, op->outputs[0], op->outputs[0]->num_dims - 1, allreduce_name.c_str()); @@ -3542,15 +3546,20 @@ void FFModel::create_operators_from_layers() { transformer_layer_parallel_identity_count[transformer_layer_id])); transformer_layer_parallel_identity_count[transformer_layer_id]++; ParallelIdentity *parallel_identity = nullptr; + LayerID pi_guid = LayerID(this->layer_global_guid++, + op->layer_guid.transformer_layer_id, + this->model_id); if (op->numOutputs == 1) { parallel_identity = new ParallelIdentity(*this, + pi_guid, op->outputs[0], op->outputs[0]->num_dims - 1, parallel_identity_name.c_str()); } else if (op->numOutputs == 2) { parallel_identity = new ParallelIdentity(*this, + pi_guid, op->outputs[1], op->outputs[1]->num_dims - 1, parallel_identity_name.c_str()); @@ -4685,25 +4694,6 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } - // RequestManager process_work_from_old_batches_task - // { - // TaskVariantRegistrar registrar( - // RM_PROCESS_WORK_FROM_OLD_BATCHES_TASK_ID, - // "RequestManager Process Work from Old Batches"); - // registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC)); - // registrar.set_leaf(); - // if (pre_register) { - // Runtime::preregister_task_variant( - // registrar, "RequestManager Process Work from Old Batches Task"); - // } else { - // if (enable_control_replication) { - // registrar.global_registration = false; - // } - // runtime->register_task_variant(registrar); - // } - // } // RequestManager prepare_next_batch_task { TaskVariantRegistrar registrar(RM_PREPARE_NEXT_BATCH_TASK_ID, @@ -7839,4 +7829,4 @@ LEGION_FOREACH_NN(DIMFUNC) LEGION_FOREACH_NN(DIMFUNC) #undef DIMFUNC -}; // namespace FlexFlow +}; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc index 300a2385c..efb72f331 100644 --- a/src/runtime/peft_weight_allocator.cc +++ b/src/runtime/peft_weight_allocator.cc @@ -338,4 +338,4 @@ void PEFTMemoryManager::check_ft_model_id(PEFTModelID const &model_id) { assert(finetuning_model_id == model_id && "PEFT bwd model is not in memory!"); } -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 1990f1d11..fc2b45da7 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -284,7 +284,6 @@ RequestManager::RequestManager() max_tokens_per_batch = -1; max_spec_tree_token_num = -1; max_sequence_length = -1; - max_finetuning_sequence_length = -1; step_idx = 0; run_idx = 0; } @@ -1251,7 +1250,8 @@ void RequestManager::add_finetuning_req_fwd_batch(BatchConfig &new_bc) { new_bc.num_active_tokens(); new_bc.requestsInfo[inference_batch_size].num_tokens_in_batch = num_peft_tokens; - new_bc.requestsInfo[inference_batch_size].max_length = request.dataset[dataset_entry].size(); + new_bc.requestsInfo[inference_batch_size].max_length = + request.dataset[dataset_entry].size(); new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; new_bc.requestsInfo[inference_batch_size].peft_model_id = request.peft_model_id; @@ -1312,7 +1312,8 @@ void RequestManager::add_finetuning_req_bwd_batch(BatchConfig &new_bc) { new_bc.num_active_tokens(); 
new_bc.requestsInfo[inference_batch_size].num_tokens_in_batch = request.dataset[dataset_entry].size(); - new_bc.requestsInfo[inference_batch_size].max_length = request.dataset[dataset_entry].size(); + new_bc.requestsInfo[inference_batch_size].max_length = + request.dataset[dataset_entry].size(); new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; new_bc.requestsInfo[inference_batch_size].peft_model_id = request.peft_model_id; diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index 0e28c02cd..96793abeb 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -3800,14 +3800,18 @@ bool FFModel::convert_graph_to_operators( case OP_ALLREDUCE: { assert(inList.size() == 1); AllReduce *allreduce = (AllReduce *)node.ptr; - new_op = new AllReduce( - *this, inputs[0], allreduce->allreduce_dim, allreduce->name); + new_op = new AllReduce(*this, + allreduce->layer_guid, + inputs[0], + allreduce->allreduce_dim, + allreduce->name); break; } case OP_PARALLEL_IDENTITY: { assert(inList.size() == 1); ParallelIdentity *parallel_identity = (ParallelIdentity *)node.ptr; new_op = new ParallelIdentity(*this, + parallel_identity->layer_guid, inputs[0], parallel_identity->parallel_identity_dim, parallel_identity->name); diff --git a/tests/peft/hf_utils.py b/tests/peft/hf_utils.py index 320966bf1..3a10fc443 100644 --- a/tests/peft/hf_utils.py +++ b/tests/peft/hf_utils.py @@ -3,10 +3,8 @@ import transformers from transformers import ( TrainerCallback, - AutoConfig, AutoModelForCausalLM, AutoTokenizer, - LlamaTokenizer, ) import os, shutil from peft import PeftConfig, PeftModel @@ -306,21 +304,10 @@ def build_peft_model(args, peft_config): def get_peft_tokenizer(args, peft_config): # Get Tokenizer - hf_config = AutoConfig.from_pretrained( - peft_config.base_model_name_or_path, trust_remote_code=True + tokenizer = AutoTokenizer.from_pretrained( + peft_config.base_model_name_or_path, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, ) - hf_arch = getattr(hf_config, "architectures")[0] - if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": - tokenizer = LlamaTokenizer.from_pretrained( - peft_config.base_model_name_or_path, - use_fast=True, - torch_dtype=torch.float32 if args.use_full_precision else torch.float16, - ) - else: - tokenizer = AutoTokenizer.from_pretrained( - peft_config.base_model_name_or_path, - torch_dtype=torch.float32 if args.use_full_precision else torch.float16, - ) if tokenizer.pad_token is None: tokenizer.pad_token = "[PAD]" tokenizer.padding_side = "left" @@ -379,4 +366,4 @@ def save_finetuned_model(model, args): print( f"Uploading the model to HF hub with id: {args.publish_peft_with_id}..." ) - model.push_to_hub(args.publish_peft_with_id, use_auth_token=True) + model.push_to_hub(args.publish_peft_with_id, use_auth_token=True) \ No newline at end of file
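Note on the recurring pattern in the hunks above: several operators gain a store_peft_activations helper that copies the finetuning request's slice of a batched activation tensor into a persistent per-operator buffer (allocated up front as data_type_size(data_type) * BatchConfig::max_sequence_length() * in_dim), so the later peft_bwd kernels can read it back. The sketch below is a minimal, self-contained illustration of that offset arithmetic only, not the FlexFlow implementation; the function and parameter names (stash_finetuning_activations, peft_buffer, batch_output, and so on) are hypothetical simplifications.

// Minimal sketch (hypothetical names, reduced error handling) of the
// activation-stashing pattern used by the store_peft_activations helpers.
#include <cassert>
#include <cstddef>
#include <cuda_runtime.h>

template <typename DT>
void stash_finetuning_activations(
    DT *peft_buffer,           // persistent buffer, capacity in_dim * max_seq_len
    size_t peft_buffer_elems,  // capacity of peft_buffer, in DT elements
    DT const *batch_output,    // operator output for the whole mixed batch
    int in_dim,                // hidden size per token
    int first_token_offset_in_batch, // where the finetuning request starts in this batch
    int tokens_stashed_so_far,       // tokens saved by earlier forward steps
    int num_ft_tokens,               // finetuning tokens in this batch
    cudaStream_t stream) {
  // Source offset: skip the inference requests that precede the finetuning
  // request inside the batched output.
  size_t batch_offset =
      static_cast<size_t>(in_dim) * first_token_offset_in_batch;
  // Destination offset: append after the part of the sequence already stashed.
  size_t request_offset = static_cast<size_t>(in_dim) * tokens_stashed_so_far;
  size_t num_elems = static_cast<size_t>(in_dim) * num_ft_tokens;
  assert(request_offset + num_elems <= peft_buffer_elems);
  // Device-to-device copy on the operator's stream; the buffer is consumed
  // later by the corresponding peft_bwd kernel.
  cudaMemcpyAsync(peft_buffer + request_offset,
                  batch_output + batch_offset,
                  num_elems * sizeof(DT),
                  cudaMemcpyDeviceToDevice,
                  stream);
}

The assert models the same capacity check the diff performs against allocated_peft_buffer_size before each copy.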