diff --git a/.github/workflows/build_wheels.yaml b/.github/workflows/build_wheels.yaml
index 995831a..5985ef0 100644
--- a/.github/workflows/build_wheels.yaml
+++ b/.github/workflows/build_wheels.yaml
@@ -138,7 +138,7 @@ jobs:
           export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
           export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
           # Limit MAX_JOBS otherwise the github runner goes OOM
-          MAX_JOBS=2 FLASH_ATTENTION_FORCE_BUILD="TRUE" FLASH_ATTENTION_FORCE_CXX11_ABI=${{ matrix.cxx11_abi}} python setup.py bdist_wheel --dist-dir=dist
+          MAX_JOBS=4 FLASH_ATTENTION_FORCE_BUILD="TRUE" FLASH_ATTENTION_FORCE_CXX11_ABI=${{ matrix.cxx11_abi}} python setup.py bdist_wheel --dist-dir=dist
           tmpname=cu${MATRIX_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ matrix.cxx11_abi }}
           wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
           ls dist/*whl |xargs -I {} mv {} dist/${wheel_name}
diff --git a/flash_attn/__init__.py b/flash_attn/__init__.py
index 4dd1f05..486101e 100644
--- a/flash_attn/__init__.py
+++ b/flash_attn/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "2.0.8.post15"
+__version__ = "2.0.8.post16"
 
 from flash_attn.flash_attn_interface import flash_attn_func
 from flash_attn.flash_attn_interface import flash_attn_kvpacked_func
diff --git a/setup.py b/setup.py
index 6ddaeae..6e2e918 100644
--- a/setup.py
+++ b/setup.py
@@ -92,9 +92,9 @@ def raise_if_cuda_home_none(global_option: str) -> None:
 
 
 def append_nvcc_threads(nvcc_extra_args):
-    _, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME)
-    if bare_metal_version >= Version("11.2"):
-        return nvcc_extra_args + ["--threads", "4"]
+    # _, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME)
+    # if bare_metal_version >= Version("11.2"):
+    #     return nvcc_extra_args + ["--threads", "4"]
     return nvcc_extra_args