diff --git a/.github/workflows/publish_base_image.yml b/.github/workflows/publish_base_image.yml deleted file mode 100644 index 26c82726..00000000 --- a/.github/workflows/publish_base_image.yml +++ /dev/null @@ -1,49 +0,0 @@ -name: Publish base docker image -on: - workflow_dispatch: - -jobs: - publish_base: - runs-on: [self-hosted, linux, release] - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKER_HUB_USER }} - password: ${{ secrets.DOCKER_HUB_TOKEN }} - - - name: Build base for cuda 12.1 - uses: docker/build-push-action@v5 - with: - context: ./docker - file: ./docker/Dockerfile.base - push: true - build-args: | - UBUNTU_VERSION=22.04 - CUDA_VERSION=12.1 - tags: | - vectorchai/scalellm_builder:cuda12.1-ubuntu22.04 - vectorchai/scalellm_builder:cuda12.1 - - - name: Build base for cuda 11.8 - uses: docker/build-push-action@v5 - with: - context: ./docker - file: ./docker/Dockerfile.base - push: true - build-args: | - UBUNTU_VERSION=22.04 - CUDA_VERSION=11.8 - tags: | - vectorchai/scalellm_builder:cuda11.8-ubuntu22.04 - vectorchai/scalellm_builder:cuda11.8 - diff --git a/.github/workflows/publish_devel_image.yml b/.github/workflows/publish_devel_image.yml index 2dcb688d..ab16d0f5 100644 --- a/.github/workflows/publish_devel_image.yml +++ b/.github/workflows/publish_devel_image.yml @@ -1,6 +1,9 @@ name: Publish devel docker image on: workflow_dispatch: +env: + # Tells where to store caches. 
+ CI_CACHE_DIR: ${{ github.workspace }}/../../ci_cache jobs: publish_base: @@ -27,6 +30,8 @@ jobs: context: ./docker file: ./docker/Dockerfile.devel push: true + cache-from: type=local,src=${{ env.CI_CACHE_DIR }}/.buildx-cache + cache-to: type=local,dest=${{ env.CI_CACHE_DIR }}/.buildx-cache build-args: | UBUNTU_VERSION=22.04 CUDA_VERSION=12.1 diff --git a/.github/workflows/publish_manylinux_image.yml b/.github/workflows/publish_manylinux_image.yml new file mode 100644 index 00000000..fd3a49a8 --- /dev/null +++ b/.github/workflows/publish_manylinux_image.yml @@ -0,0 +1,52 @@ +name: Publish manylinux docker image +on: + workflow_dispatch: +env: + # Tells where to store caches. + CI_CACHE_DIR: ${{ github.workspace }}/../../ci_cache + +jobs: + publish_base: + runs-on: [self-hosted, linux, release] + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_HUB_USER }} + password: ${{ secrets.DOCKER_HUB_TOKEN }} + + - name: Build base for cuda 12.1 + uses: docker/build-push-action@v5 + with: + context: ./docker + file: ./docker/Dockerfile.manylinux + push: true + cache-from: type=local,src=${{ env.CI_CACHE_DIR }}/.buildx-cache + cache-to: type=local,dest=${{ env.CI_CACHE_DIR }}/.buildx-cache + build-args: | + CUDA_VERSION=12.1 + tags: | + vectorchai/scalellm_manylinux:cuda12.1 + + # - name: Build base for cuda 11.8 + # uses: docker/build-push-action@v5 + # with: + # context: ./docker + # file: ./docker/Dockerfile.manylinux + # push: true + # cache-from: type=local,src=${{ env.CI_CACHE_DIR }}/.buildx-cache + # cache-to: type=local,dest=${{ env.CI_CACHE_DIR }}/.buildx-cache + # build-args: | + # CUDA_VERSION=11.8 + # tags: | + # vectorchai/scalellm_manylinux:cuda11.8 + diff --git a/.github/workflows/release_wheel.yml b/.github/workflows/release_wheel.yml index 4aefb1c6..32ac47a7 100644 ---
a/.github/workflows/release_wheel.yml +++ b/.github/workflows/release_wheel.yml @@ -13,10 +13,11 @@ env: jobs: build_wheel: strategy: + fail-fast: false matrix: - python: ["3.9", "3.10", "3.11", "3.12"] - cuda: ["11.8", "12.1"] - torch: ["2.1", "2.2", "2.3"] + python: ["3.9", "3.10", "3.11"] + cuda: ["12.1"] + torch: ["2.2", "2.3"] runs-on: [self-hosted, linux, release] steps: - name: Checkout repository @@ -36,7 +37,7 @@ jobs: -e VCPKG_DEFAULT_BINARY_CACHE=/ci_cache/.vcpkg/bincache \ -e CCACHE_DIR=/ci_cache/.ccache \ --user $(id -u):$(id -g) \ - vectorchai/scalellm_builder:cuda${{ matrix.cuda }}-ubuntu22.04 \ + vectorchai/scalellm_manylinux:cuda${{ matrix.cuda }} \ bash /ScaleLLM/scripts/build_wheel.sh timeout-minutes: 60 diff --git a/CMakeLists.txt b/CMakeLists.txt index 384009cf..21187549 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,7 @@ set_property(GLOBAL PROPERTY USE_FOLDERS ON) option(USE_CCACHE "Attempt using CCache to wrap the compilation" ON) option(USE_CXX11_ABI "Use the new C++-11 ABI, which is not backwards compatible." 
ON) +option(USE_MANYLINUX "Build for manylinux" OFF) set(CMAKE_POSITION_INDEPENDENT_CODE ON) @@ -144,7 +145,6 @@ if(UNIX) set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Og") endif() -find_package(Boost CONFIG REQUIRED) find_package(Threads REQUIRED) # find all dependencies from vcpkg find_package(fmt CONFIG REQUIRED) @@ -162,7 +162,13 @@ find_package(prometheus-cpp CONFIG REQUIRED) find_package(stduuid CONFIG REQUIRED) find_package(RapidJSON CONFIG REQUIRED) -find_package(Python REQUIRED COMPONENTS Interpreter Development) +if (USE_MANYLINUX) + # manylinux doesn't ship Development.Embed + find_package(Python REQUIRED COMPONENTS Interpreter Development.Module) +else() + find_package(Python REQUIRED COMPONENTS Interpreter Development) +endif() + find_package(NCCL REQUIRED) if (USE_CXX11_ABI) diff --git a/docker/Dockerfile.base b/docker/Dockerfile.base index fde5c1d4..d3bc5df7 100644 --- a/docker/Dockerfile.base +++ b/docker/Dockerfile.base @@ -29,7 +29,7 @@ ENV DESIRED_CUDA ${CUDA_VERSION} ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH # Install gcc -ARG GCC_VERSION=12 +ARG GCC_VERSION=11 RUN apt-get update \ && apt-get install -y --no-install-recommends \ software-properties-common gpg-agent diff --git a/docker/Dockerfile.manylinux b/docker/Dockerfile.manylinux new file mode 100644 index 00000000..8f6c52e4 --- /dev/null +++ b/docker/Dockerfile.manylinux @@ -0,0 +1,48 @@ +FROM quay.io/pypa/manylinux_2_28_x86_64 as base + +LABEL maintainer="mi@vectorch.com" +ENV DEBIAN_FRONTEND noninteractive + +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 + +# Install common dependencies +COPY ./common/install_base.sh install_base.sh +RUN bash ./install_base.sh && rm install_base.sh + +# Install user +COPY ./common/install_user.sh install_user.sh +RUN bash ./install_user.sh && rm install_user.sh + +# Install cuda, cudnn and nccl +ARG CUDA_VERSION=12.1 +COPY ./common/install_cuda.sh install_cuda.sh +RUN bash ./install_cuda.sh ${CUDA_VERSION} 
&& rm install_cuda.sh +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH + +# ARG CMAKE_VERSION=3.18.5 +# COPY ./common/install_cmake.sh install_cmake.sh +# RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi +# RUN rm install_cmake.sh + +ARG NINJA_VERSION=1.11.1 +COPY ./common/install_ninja.sh install_ninja.sh +RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi +RUN rm install_ninja.sh + +ARG CCACHE_VERSION=4.8.3 +COPY ./common/install_ccache.sh install_ccache.sh +RUN if [ -n "${CCACHE_VERSION}" ]; then bash ./install_ccache.sh; fi +RUN rm install_ccache.sh + +# install rust +ENV RUSTUP_HOME=/usr/local/rustup +ENV CARGO_HOME=/usr/local/cargo +ENV PATH=/usr/local/cargo/bin:$PATH +RUN curl https://sh.rustup.rs -sSf | sh -s -- -y +# give everyone permission to use rust +RUN chmod -R a+w ${RUSTUP_HOME} ${CARGO_HOME} +RUN rustup --version; cargo --version; rustc --version + +CMD ["bash"] \ No newline at end of file diff --git a/docker/common/install_base.sh b/docker/common/install_base.sh index cbc6990f..deb23825 100644 --- a/docker/common/install_base.sh +++ b/docker/common/install_base.sh @@ -31,11 +31,31 @@ install_ubuntu() { rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* } +install_almalinux() { + yum -y update + yum -y install \ + zip \ + wget \ + curl \ + perl \ + sudo \ + vim \ + jq \ + libtool \ + unzip + + # Cleanup + yum clean all +} + ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') case "$ID" in ubuntu) install_ubuntu ;; + almalinux) + install_almalinux + ;; *) echo "Unable to determine OS..." 
exit 1 diff --git a/docker/common/install_ccache.sh b/docker/common/install_ccache.sh new file mode 100644 index 00000000..4ccf25f0 --- /dev/null +++ b/docker/common/install_ccache.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -ex + +[ -n "$CCACHE_VERSION" ] + +ARCH=$(uname -m) +url=https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/ccache-${CCACHE_VERSION}-linux-${ARCH}.tar.xz + +pushd /tmp +curl -L "$url" | xz -d | tar -x +cp ./ccache-${CCACHE_VERSION}-linux-${ARCH}/ccache /usr/bin/ccache +popd + +# set max cache size to 5GiB +/usr/bin/ccache -M 5Gi \ No newline at end of file diff --git a/docker/common/install_cmake.sh b/docker/common/install_cmake.sh index 3332ac09..26257bf1 100644 --- a/docker/common/install_cmake.sh +++ b/docker/common/install_cmake.sh @@ -10,6 +10,9 @@ case "$ID" in ubuntu) apt-get remove cmake -y ;; + almalinux) + rm -f /usr/local/bin/cmake + ;; *) echo "Unable to determine OS..." exit 1 diff --git a/docker/common/install_cuda.sh b/docker/common/install_cuda.sh new file mode 100644 index 00000000..fd32031b --- /dev/null +++ b/docker/common/install_cuda.sh @@ -0,0 +1,235 @@ +#!/bin/bash + +# adapted from https://github.com/pytorch/builder/blob/main/common/install_cuda.sh + +set -ex + +function install_cusparselt_040 { + # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html + mkdir tmp_cusparselt && pushd tmp_cusparselt + wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.4.0.7-archive.tar.xz + tar xf libcusparse_lt-linux-x86_64-0.4.0.7-archive.tar.xz + cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/include/* /usr/local/cuda/include/ + cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/lib/* /usr/local/cuda/lib64/ + popd + rm -rf tmp_cusparselt +} + +function install_cusparselt_052 { + # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html + mkdir tmp_cusparselt && pushd tmp_cusparselt + wget -q
https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.5.2.1-archive.tar.xz + tar xf libcusparse_lt-linux-x86_64-0.5.2.1-archive.tar.xz + cp -a libcusparse_lt-linux-x86_64-0.5.2.1-archive/include/* /usr/local/cuda/include/ + cp -a libcusparse_lt-linux-x86_64-0.5.2.1-archive/lib/* /usr/local/cuda/lib64/ + popd + rm -rf tmp_cusparselt +} + +function install_118 { + echo "Installing CUDA 11.8 and cuDNN 8.7 and NCCL 2.15 and cuSparseLt-0.4.0" + rm -rf /usr/local/cuda-11.8 /usr/local/cuda + # install CUDA 11.8.0 in the same container + wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run + chmod +x cuda_11.8.0_520.61.05_linux.run + ./cuda_11.8.0_520.61.05_linux.run --toolkit --silent + rm -f cuda_11.8.0_520.61.05_linux.run + rm -f /usr/local/cuda && ln -s /usr/local/cuda-11.8 /usr/local/cuda + + # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement + mkdir tmp_cudnn && cd tmp_cudnn + wget -q https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz -O cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz + tar xf cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz + cp -a cudnn-linux-x86_64-8.7.0.84_cuda11-archive/include/* /usr/local/cuda/include/ + cp -a cudnn-linux-x86_64-8.7.0.84_cuda11-archive/lib/* /usr/local/cuda/lib64/ + cd .. + rm -rf tmp_cudnn + + # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses + # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build + git clone -b v2.20.5-1 --depth 1 https://github.com/NVIDIA/nccl.git + cd nccl && make -j src.build + cp -a build/include/* /usr/local/cuda/include/ + cp -a build/lib/* /usr/local/cuda/lib64/ + cd .. 
+ rm -rf nccl + + install_cusparselt_040 + + ldconfig +} + +function install_121 { + echo "Installing CUDA 12.1 and cuDNN 8.9 and NCCL 2.20.5 and cuSparseLt-0.5.2" + rm -rf /usr/local/cuda-12.1 /usr/local/cuda + # install CUDA 12.1.0 in the same container + wget -q https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run + chmod +x cuda_12.1.1_530.30.02_linux.run + ./cuda_12.1.1_530.30.02_linux.run --toolkit --silent + rm -f cuda_12.1.1_530.30.02_linux.run + rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.1 /usr/local/cuda + + # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement + mkdir tmp_cudnn && cd tmp_cudnn + wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-8.9.2.26_cuda12-archive.tar.xz -O cudnn-linux-x86_64-8.9.2.26_cuda12-archive.tar.xz + tar xf cudnn-linux-x86_64-8.9.2.26_cuda12-archive.tar.xz + cp -a cudnn-linux-x86_64-8.9.2.26_cuda12-archive/include/* /usr/local/cuda/include/ + cp -a cudnn-linux-x86_64-8.9.2.26_cuda12-archive/lib/* /usr/local/cuda/lib64/ + cd .. + rm -rf tmp_cudnn + + # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses + # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build + git clone -b v2.20.5-1 --depth 1 https://github.com/NVIDIA/nccl.git + cd nccl && make -j src.build + cp -a build/include/* /usr/local/cuda/include/ + cp -a build/lib/* /usr/local/cuda/lib64/ + cd .. 
+ rm -rf nccl + + install_cusparselt_052 + + ldconfig +} + +function install_124 { + echo "Installing CUDA 12.4 and cuDNN 8.9 and NCCL 2.20.5 and cuSparseLt-0.5.2" + rm -rf /usr/local/cuda-12.4 /usr/local/cuda + # install CUDA 12.4.0 in the same container + wget -q https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run + chmod +x cuda_12.4.0_550.54.14_linux.run + ./cuda_12.4.0_550.54.14_linux.run --toolkit --silent + rm -f cuda_12.4.0_550.54.14_linux.run + rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda + + # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement + mkdir tmp_cudnn && cd tmp_cudnn + wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-8.9.2.26_cuda12-archive.tar.xz -O cudnn-linux-x86_64-8.9.2.26_cuda12-archive.tar.xz + tar xf cudnn-linux-x86_64-8.9.2.26_cuda12-archive.tar.xz + cp -a cudnn-linux-x86_64-8.9.2.26_cuda12-archive/include/* /usr/local/cuda/include/ + cp -a cudnn-linux-x86_64-8.9.2.26_cuda12-archive/lib/* /usr/local/cuda/lib64/ + cd .. + rm -rf tmp_cudnn + + # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses + # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build + git clone -b v2.20.5-1 --depth 1 https://github.com/NVIDIA/nccl.git + cd nccl && make -j src.build + cp -a build/include/* /usr/local/cuda/include/ + cp -a build/lib/* /usr/local/cuda/lib64/ + cd .. 
+ rm -rf nccl + + install_cusparselt_052 + + ldconfig +} + +function prune_118 { + echo "Pruning CUDA 11.8 and cuDNN" + ##################################################################################### + # CUDA 11.8 prune static libs + ##################################################################################### + export NVPRUNE="/usr/local/cuda-11.8/bin/nvprune" + export CUDA_LIB_DIR="/usr/local/cuda-11.8/lib64" + + export GENCODE="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" + export GENCODE_CUDNN="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" + + if [[ -n "$OVERRIDE_GENCODE" ]]; then + export GENCODE=$OVERRIDE_GENCODE + fi + + # all CUDA libs except CuDNN and CuBLAS (cudnn and cublas need arch 3.7 included) + ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \ + | xargs -I {} bash -c \ + "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}" + + # prune CuDNN and CuBLAS + $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a + $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a + + ##################################################################################### + # CUDA 11.8 prune visual tools + ##################################################################################### + export CUDA_BASE="/usr/local/cuda-11.8/" + rm 
-rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2022.3.0 $CUDA_BASE/nsight-systems-2022.4.2/ +} + +function prune_121 { + echo "Pruning CUDA 12.1" + ##################################################################################### + # CUDA 12.1 prune static libs + ##################################################################################### + export NVPRUNE="/usr/local/cuda-12.1/bin/nvprune" + export CUDA_LIB_DIR="/usr/local/cuda-12.1/lib64" + + export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" + export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" + + if [[ -n "$OVERRIDE_GENCODE" ]]; then + export GENCODE=$OVERRIDE_GENCODE + fi + + # all CUDA libs except CuDNN and CuBLAS + ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \ + | xargs -I {} bash -c \ + "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}" + + # prune CuDNN and CuBLAS + $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a + $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a + + ##################################################################################### + # CUDA 12.1 prune visual tools + ##################################################################################### + export CUDA_BASE="/usr/local/cuda-12.1/" + rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2023.1.0 
$CUDA_BASE/nsight-systems-2023.1.2/ +} + +function prune_124 { + echo "Pruning CUDA 12.4" + ##################################################################################### + # CUDA 12.4 prune static libs + ##################################################################################### + export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune" + export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64" + + export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" + export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" + + if [[ -n "$OVERRIDE_GENCODE" ]]; then + export GENCODE=$OVERRIDE_GENCODE + fi + + # all CUDA libs except CuDNN and CuBLAS + ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \ + | xargs -I {} bash -c \ + "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}" + + # prune CuDNN and CuBLAS + $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a + $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a + + ##################################################################################### + # CUDA 12.4 prune visual tools + ##################################################################################### + export CUDA_BASE="/usr/local/cuda-12.4/" + rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/ +} + +# idiomatic parameter and option handling in sh
+while test $# -gt 0 +do + case "$1" in + 11.8) install_118; prune_118 + ;; + 12.1) install_121; prune_121 + ;; + 12.4) install_124; prune_124 + ;; + *) echo "bad argument $1"; exit 1 + ;; + esac + shift +done diff --git a/docker/common/install_user.sh b/docker/common/install_user.sh index 9d416049..8aa49587 100644 --- a/docker/common/install_user.sh +++ b/docker/common/install_user.sh @@ -9,8 +9,4 @@ echo "jenkins:x:1000:" >> /etc/group echo "jenkins:*:19110:0:99999:7:::" >>/etc/shadow # allow sudo -echo 'jenkins ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/jenkins - - -# test that sudo works -sudo -u jenkins sudo -v \ No newline at end of file +echo 'jenkins ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/jenkins \ No newline at end of file diff --git a/python/scalellm/csrc/CMakeLists.txt b/python/scalellm/csrc/CMakeLists.txt index ec9f1bbd..899f0e2b 100644 --- a/python/scalellm/csrc/CMakeLists.txt +++ b/python/scalellm/csrc/CMakeLists.txt @@ -16,5 +16,5 @@ pybind_extension( absl::strings gflags::gflags glog::glog - Python::Python + Python::Module ) diff --git a/python/setup.py b/python/setup.py index 8a7ea7e9..77e6a11a 100644 --- a/python/setup.py +++ b/python/setup.py @@ -114,6 +114,7 @@ def build_extension(self, ext: CMakeExtension): "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON", f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}", "-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON", + "-DUSE_MANYLINUX:BOOL=ON", f"-DPython_EXECUTABLE:FILEPATH={sys.executable}", f"-DPYTHON_INCLUDE_DIRS={python_include_dir}", f"-DCMAKE_CUDA_ARCHITECTURES={cuda_architectures}", diff --git a/src/server/CMakeLists.txt b/src/server/CMakeLists.txt index 6a601ac0..382a54b2 100644 --- a/src/server/CMakeLists.txt +++ b/src/server/CMakeLists.txt @@ -27,20 +27,23 @@ cc_library( glog::glog ) -cc_binary( - NAME - simple - SRCS - simple.cpp - DEPS - :engine - :speculative - torch - absl::strings - gflags::gflags - glog::glog - Python::Python -) +if (NOT USE_MANYLINUX) + # manylinux doesn't ship with Development.Embed + cc_binary( + 
NAME + simple + SRCS + simple.cpp + DEPS + :engine + :speculative + torch + absl::strings + gflags::gflags + glog::glog + Python::Python + ) +endif() cc_binary( NAME