From b9e7d38d7ff6c114affcecd440bfa3c5ffcb79dd Mon Sep 17 00:00:00 2001
From: Federico Busato <50413820+fbusato@users.noreply.github.com>
Date: Wed, 5 Mar 2025 12:48:49 -0800
Subject: [PATCH] Extended `<cuda/bit>` operations: `bitfield_insert`,
 `bitfield_extract`, `bit_reverse`, `bitmask` (#3941)

Co-authored-by: Michael Schellenberger Costa <miscco@nvidia.com>
Co-authored-by: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
---
 docs/libcudacxx/extended_api.rst              |   1 +
 docs/libcudacxx/extended_api/bit.rst          |  37 ++++
 .../extended_api/bit/bit_reverse.rst          |  66 +++++++
 .../extended_api/bit/bitfield_extract.rst     |  69 ++++++++
 .../extended_api/bit/bitfield_insert.rst      |  70 ++++++++
 docs/libcudacxx/extended_api/bit/bitmask.rst  |  69 ++++++++
 libcudacxx/include/cuda/__bit/bit_reverse.h   | 167 ++++++++++++++++++
 libcudacxx/include/cuda/__bit/bitfield.h      | 118 +++++++++++++
 libcudacxx/include/cuda/__bit/bitmask.h       |  84 +++++++++
 libcudacxx/include/cuda/bit                   |  28 +++
 libcudacxx/include/cuda/std/__cccl/builtin.h  |  16 ++
 .../libcudacxx/cuda/bit/bit_reverse.pass.cpp  |  62 +++++++
 .../libcudacxx/cuda/bit/bitfield.fail.cpp     |  33 ++++
 .../libcudacxx/cuda/bit/bitfield.pass.cpp     |  80 +++++++++
 .../test/libcudacxx/cuda/bit/bitmask.pass.cpp |  62 +++++++
 15 files changed, 962 insertions(+)
 create mode 100644 docs/libcudacxx/extended_api/bit.rst
 create mode 100644 docs/libcudacxx/extended_api/bit/bit_reverse.rst
 create mode 100644 docs/libcudacxx/extended_api/bit/bitfield_extract.rst
 create mode 100644 docs/libcudacxx/extended_api/bit/bitfield_insert.rst
 create mode 100644 docs/libcudacxx/extended_api/bit/bitmask.rst
 create mode 100644 libcudacxx/include/cuda/__bit/bit_reverse.h
 create mode 100644 libcudacxx/include/cuda/__bit/bitfield.h
 create mode 100644 libcudacxx/include/cuda/__bit/bitmask.h
 create mode 100644 libcudacxx/include/cuda/bit
 create mode 100644 libcudacxx/test/libcudacxx/cuda/bit/bit_reverse.pass.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/bit/bitfield.fail.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/bit/bitfield.pass.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/bit/bitmask.pass.cpp

diff --git a/docs/libcudacxx/extended_api.rst b/docs/libcudacxx/extended_api.rst
index 1df995a432f..0c7a5b21a6a 100644
--- a/docs/libcudacxx/extended_api.rst
+++ b/docs/libcudacxx/extended_api.rst
@@ -6,6 +6,7 @@ Extended API
 .. toctree::
    :maxdepth: 2
 
+   extended_api/bit
    extended_api/execution_model
    extended_api/memory_model
    extended_api/thread_groups
diff --git a/docs/libcudacxx/extended_api/bit.rst b/docs/libcudacxx/extended_api/bit.rst
new file mode 100644
index 00000000000..7e433913089
--- /dev/null
+++ b/docs/libcudacxx/extended_api/bit.rst
@@ -0,0 +1,37 @@
+.. _libcudacxx-extended-api-bit:
+
+Bit
+===
+
+.. toctree::
+   :hidden:
+   :maxdepth: 1
+
+   cuda::bitmask <bit/bitmask>
+   cuda::bit_reverse <bit/bit_reverse>
+   cuda::bitfield_insert <bit/bitfield_insert>
+   cuda::bitfield_extract <bit/bitfield_extract>
+
+.. list-table::
+   :widths: 25 45 30 30
+   :header-rows: 0
+
+   * - :ref:`bitmask <libcudacxx-extended-api-bit-bitmask>`
+     - Generate a bitmask
+     - CCCL 3.0.0
+     - CUDA 13.0
+
+   * - :ref:`bit_reverse <libcudacxx-extended-api-bit-bit_reverse>`
+     - Reverse the order of bits
+     - CCCL 3.0.0
+     - CUDA 13.0
+
+   * - :ref:`bitfield_insert <libcudacxx-extended-api-bit-bitfield_insert>`
+     - Insert a bitfield
+     - CCCL 3.0.0
+     - CUDA 13.0
+
+   * - :ref:`bitfield_extract <libcudacxx-extended-api-bit-bitfield_extract>`
+     - Extract a bitfield
+     - CCCL 3.0.0
+     - CUDA 13.0
diff --git a/docs/libcudacxx/extended_api/bit/bit_reverse.rst b/docs/libcudacxx/extended_api/bit/bit_reverse.rst
new file mode 100644
index 00000000000..63dd7b3fcd6
--- /dev/null
+++ b/docs/libcudacxx/extended_api/bit/bit_reverse.rst
@@ -0,0 +1,66 @@
+.. _libcudacxx-extended-api-bit-bit_reverse:
+
+``bit_reverse``
+===============
+
+.. code:: cpp
+
+   template <typename T>
+   [[nodiscard]] constexpr T
+   bit_reverse(T value) noexcept;
+
+The function reverses the order of bits in a value.
+
+**Parameters**
+
+- ``value``: Input value
+
+**Return value**
+
+- Value with reversed bits
+
+**Mandates**
+
+- ``T`` is an unsigned integer type.
+
+**Performance considerations**
+
+The function performs the following operations:
+
+- Device:
+
+    - ``uint8_t`` ``uint16_t``: ``PRMT``, ``BREV``
+    - ``uint32_t``: ``BREV``
+    - ``uint64_t``: ``BREV`` x2, ``MOV`` x2
+    - ``uint128_t``: ``BREV`` x4, ``MOV`` x4
+
+- Host: ``__builtin_bitreverse<N>`` with clang
+
+.. note::
+
+    When the input values are run-time values that the compiler can resolve at compile-time, e.g. an index of a loop with a fixed number of iterations, using the function could not be optimal.
+
+.. note::
+
+    GCC <= 8 uses a slow path with more instructions even in CUDA
+
+Example
+-------
+
+.. code:: cpp
+
+    #include <cuda/bit>
+    #include <cuda/std/cassert>
+
+    __global__ void bit_reverse_kernel() {
+        assert(bitfield_reverse(0u) == ~0u);
+        assert(bitfield_reverse(uint8_t{0b00001011}) == uint8_t{0b11010000});
+    }
+
+    int main() {
+        bit_reverse_kernel<<<1, 1>>>();
+        cudaDeviceSynchronize();
+        return 0;
+    }
+
+`See it on Godbolt 🔗 <https://godbolt.org/z/K36dvoh58>`_
diff --git a/docs/libcudacxx/extended_api/bit/bitfield_extract.rst b/docs/libcudacxx/extended_api/bit/bitfield_extract.rst
new file mode 100644
index 00000000000..2d9d59c72ff
--- /dev/null
+++ b/docs/libcudacxx/extended_api/bit/bitfield_extract.rst
@@ -0,0 +1,69 @@
+.. _libcudacxx-extended-api-bit-bitfield_extract:
+
+``bitfield_extract``
+====================
+
+.. code:: cpp
+
+   template <typename T>
+   [[nodiscard]] constexpr T
+   bitfield_extract(T value, int start, int width) noexcept;
+
+The function extracts a bitfield from a value and returns it in the lower bits.
+``bitfield_extract()`` computes ``(value >> start) & bitfield``, where ``bitfield`` is a sequence of bits of width ``width``.
+
+**Parameters**
+
+- ``value``: The value to apply the bitfield.
+- ``start``:  Initial position of the bitfield.
+- ``width``:  Width of the bitfield.
+
+**Return value**
+
+- ``(value >> start) & bitfield``.
+
+**Mandates**
+
+- ``T`` is an unsigned integer type.
+
+**Preconditions**
+
+    - ``start >= 0 && start < num_bits(T)``
+    - ``width >  0 && width <= num_bits(T)``
+    - ``start + width <= num_bits(T)``
+
+**Performance considerations**
+
+The function performs the following operations in CUDA for ``uint8_t``, ``uint16_t``, ``uint32_t``:
+
+- ``SM < 70``: ``BFE``
+- ``SM >= 70``: ``BMSK``, bitwise operation x2
+
+.. note::
+
+    When the input values are run-time values that the compiler can resolve at compile-time, e.g. an index of a loop with a fixed number of iterations, using the function could not be optimal.
+
+.. note::
+
+    GCC <= 8 uses a slow path with more instructions even in CUDA
+
+Example
+-------
+
+.. code:: cpp
+
+    #include <cuda/bit>
+    #include <cuda/std/cassert>
+
+    __global__ void bitfield_insert_kernel() {
+        assert(cuda::bitfield_extract(~0u, 0, 4) == 0b1111);
+        assert(cuda::bitfield_extract(0b1011000, 3, 4) == 0b1011);
+    }
+
+    int main() {
+        bitfield_insert_kernel<<<1, 1>>>();
+        cudaDeviceSynchronize();
+        return 0;
+    }
+
+`See it on Godbolt 🔗 <https://godbolt.org/z/WvqfG9nbP>`_
diff --git a/docs/libcudacxx/extended_api/bit/bitfield_insert.rst b/docs/libcudacxx/extended_api/bit/bitfield_insert.rst
new file mode 100644
index 00000000000..23015d65f05
--- /dev/null
+++ b/docs/libcudacxx/extended_api/bit/bitfield_insert.rst
@@ -0,0 +1,70 @@
+.. _libcudacxx-extended-api-bit-bitfield_insert:
+
+``bitfield_insert``
+===================
+
+.. code:: cpp
+
+   template <typename T>
+   [[nodiscard]] constexpr T
+   bitfield_insert(T dest, T source, int start, int width) noexcept;
+
+The function extracts the lower bitfield of size ``width`` from ``source`` and inserts it into ``dest`` at position ``start``.
+
+**Parameters**
+
+- ``dest``:   The value to insert the bitfield.
+- ``source``: The value from which extract the bitfield.
+- ``start``:  Initial position of the bitfield.
+- ``width``:  Width of the bitfield.
+
+**Return value**
+
+- ``((value << start) & mask) | (source & ~mask)``, where ``mask`` is a bitmask of width ``width``.
+
+**Mandates**
+
+- ``T`` is an unsigned integer type.
+
+**Preconditions**
+
+    - ``start >= 0 && start < num_bits(T)``
+    - ``width >  0 && width <= num_bits(T)``
+    - ``start + width <= num_bits(T)``
+
+**Performance considerations**
+
+The function performs the following operations in CUDA for ``uint8_t``, ``uint16_t``, ``uint32_t``:
+
+- ``SM < 70``: ``BFI``
+- ``SM >= 70``: ``BMSK``, bitwise operation x5
+
+.. note::
+
+    When the input values are run-time values that the compiler can resolve at compile-time, e.g. an index of a loop with a fixed number of iterations, using the function could not be optimal.
+
+.. note::
+
+    GCC <= 8 uses a slow path with more instructions even in CUDA
+
+Example
+-------
+
+.. code:: cpp
+
+    #include <cuda/bit>
+    #include <cuda/std/cassert>
+
+    __global__ void bitfield_insert_kernel() {
+        assert(cuda::bitfield_insert(0u, 0xFFFFu, 0, 4) == 0b1111);
+        assert(cuda::bitfield_insert(0u, 0xFFFFu, 3, 4) == 0b1111000);
+        assert(cuda::bitfield_insert(1u, 0xFFFFu, 3, 4) == 0b1111001);
+    }
+
+    int main() {
+        bitfield_insert_kernel<<<1, 1>>>();
+        cudaDeviceSynchronize();
+        return 0;
+    }
+
+`See it on Godbolt 🔗 <https://godbolt.org/z/Phs8czqes>`_
diff --git a/docs/libcudacxx/extended_api/bit/bitmask.rst b/docs/libcudacxx/extended_api/bit/bitmask.rst
new file mode 100644
index 00000000000..0308645df95
--- /dev/null
+++ b/docs/libcudacxx/extended_api/bit/bitmask.rst
@@ -0,0 +1,69 @@
+.. _libcudacxx-extended-api-bit-bitmask:
+
+``bitmask``
+===========
+
+.. code:: cpp
+
+   template <typename T>
+   [[nodiscard]] constexpr T
+   bitmask(int start, int width) noexcept;
+
+The function generates a bitmask of size ``width`` starting at position ``start``.
+
+**Parameters**
+
+- ``start``: starting position of the bitmask
+- ``width``: width of the bitmask
+
+**Return value**
+
+- Bitmask of size ``width`` starting at ``start``
+
+**Mandates**
+
+- ``T`` is an unsigned integral type.
+
+**Preconditions**
+
+    - ``start >= 0 && start < num_bits(T)``
+    - ``width >  0 && width <= num_bits(T)``
+    - ``start + width <= num_bits(T)``
+
+**Performance considerations**
+
+The function performs the following operations in device code:
+
+- ``uint8_t``, ``uint16_t``, ``uint32_t``: ``BMSK``
+- ``uint64_t``: ``SHL`` x4, ``UADD`` x2
+- ``uint128_t``: ``SHL`` x8, ``UADD`` x4
+
+.. note::
+
+    When the input values are run-time values that the compiler can resolve at compile-time, e.g. an index of a loop with a fixed number of iterations, using the function could not be optimal.
+
+.. note::
+
+    GCC <= 8 uses a slow path with more instructions even in CUDA
+
+Example
+-------
+
+.. code:: cpp
+
+    #include <cuda/bit>
+    #include <cuda/std/cassert>
+    #include <cuda/std/cstdint>
+
+    __global__ void bitmask_kernel() {
+        assert(cuda::bitmask<uint32_t>(2, 4) == 0b111100u);
+        assert(cuda::bitmask<uint64_t>(1, 3) == uint64_t{0b1110});
+    }
+
+    int main() {
+        bitmask_kernel<<<1, 1>>>();
+        cudaDeviceSynchronize();
+        return 0;
+    }
+
+`See it on Godbolt 🔗 <https://godbolt.org/z/PPqP8rTPd>`_
diff --git a/libcudacxx/include/cuda/__bit/bit_reverse.h b/libcudacxx/include/cuda/__bit/bit_reverse.h
new file mode 100644
index 00000000000..1c02074dbe9
--- /dev/null
+++ b/libcudacxx/include/cuda/__bit/bit_reverse.h
@@ -0,0 +1,167 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___BIT_BIT_REVERSE_H
+#define _CUDA___BIT_BIT_REVERSE_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__type_traits/is_constant_evaluated.h>
+#include <cuda/std/__type_traits/is_unsigned_integer.h>
+#include <cuda/std/cstdint>
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
+
+#if defined(_CCCL_BUILTIN_BITREVERSE32)
+
+template <typename _Tp>
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp __bit_reverse_builtin(_Tp __value) noexcept
+{
+#  if _CCCL_HAS_INT128()
+  if constexpr (sizeof(_Tp) == sizeof(__uint128_t))
+  {
+    auto __high = static_cast<__uint128_t>(_CCCL_BUILTIN_BITREVERSE64(static_cast<uint64_t>(__value))) << 64;
+    auto __low  = static_cast<__uint128_t>(_CCCL_BUILTIN_BITREVERSE64(static_cast<uint64_t>(__value >> 64)));
+    return __high | __low;
+  }
+#  endif // _CCCL_HAS_INT128()
+  if constexpr (sizeof(_Tp) == sizeof(uint64_t))
+  {
+    return _CCCL_BUILTIN_BITREVERSE64(__value);
+  }
+  else if constexpr (sizeof(_Tp) == sizeof(uint32_t))
+  {
+    return _CCCL_BUILTIN_BITREVERSE32(__value);
+  }
+  else if constexpr (sizeof(_Tp) == sizeof(uint16_t))
+  {
+    return _CCCL_BUILTIN_BITREVERSE16(__value);
+  }
+  else
+  {
+    return _CCCL_BUILTIN_BITREVERSE8(__value);
+  }
+}
+
+#endif // defined(_CCCL_BUILTIN_BITREVERSE32)
+
+#if _CCCL_HAS_CUDA_COMPILER
+
+template <typename _Tp>
+_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE constexpr _Tp __bit_reverse_device(_Tp __value) noexcept
+{
+#  if _CCCL_HAS_INT128()
+  if constexpr (sizeof(_Tp) == sizeof(__uint128_t))
+  {
+    auto __high = static_cast<__uint128_t>(::cuda::__bit_reverse_device(static_cast<uint64_t>(__value))) << 64;
+    auto __low  = static_cast<__uint128_t>(::cuda::__bit_reverse_device(static_cast<uint64_t>(__value >> 64)));
+    return __high | __low;
+  }
+#  endif // _CCCL_HAS_INT128()
+  if constexpr (sizeof(_Tp) == sizeof(uint64_t))
+  {
+    NV_IF_TARGET(NV_IS_DEVICE, (return __brevll(__value);))
+  }
+  else if constexpr (sizeof(_Tp) == sizeof(uint32_t))
+  {
+    NV_IF_TARGET(NV_IS_DEVICE, (return __brev(__value);))
+  }
+  else if constexpr (sizeof(_Tp) == sizeof(uint16_t))
+  {
+    NV_IF_TARGET(NV_IS_DEVICE, (return __brev(static_cast<uint32_t>(__value) << 16);))
+  }
+  else
+  {
+    NV_IF_TARGET(NV_IS_DEVICE, (return __brev(static_cast<uint32_t>(__value) << 24);))
+  }
+  _CCCL_UNREACHABLE();
+}
+
+#endif // _CCCL_HAS_CUDA_COMPILER
+
+template <typename _Tp>
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp __bit_reverse_generic(_Tp __value) noexcept
+{
+#if _CCCL_HAS_INT128()
+  if constexpr (sizeof(_Tp) == sizeof(__uint128_t))
+  {
+    constexpr auto __c1 = __uint128_t{0x5555555555555555} << 64 | uint64_t{0x5555555555555555};
+    constexpr auto __c2 = __uint128_t{0x3333333333333333} << 64 | uint64_t{0x3333333333333333};
+    constexpr auto __c3 = __uint128_t{0x0F0F0F0F0F0F0F0F} << 64 | uint64_t{0x0F0F0F0F0F0F0F0F};
+    constexpr auto __c4 = __uint128_t{0x00FF00FF00FF00FF} << 64 | uint64_t{0x00FF00FF00FF00FF};
+    constexpr auto __c5 = __uint128_t{0x0000FFFF0000FFFF} << 64 | uint64_t{0x0000FFFF0000FFFF};
+    constexpr auto __c6 = __uint128_t{0x00000000FFFFFFFF} << 64 | uint64_t{0x00000000FFFFFFFF};
+    __value             = ((__value >> 1) & __c1) | ((__value & __c1) << 1);
+    __value             = ((__value >> 2) & __c2) | ((__value & __c2) << 2);
+    __value             = ((__value >> 4) & __c3) | ((__value & __c3) << 4);
+    __value             = ((__value >> 8) & __c4) | ((__value & __c4) << 8);
+    __value             = ((__value >> 16) & __c5) | ((__value & __c5) << 16);
+    __value             = ((__value >> 32) & __c6) | ((__value & __c6) << 32);
+    return (__value >> 64) | (__value << 64);
+  }
+#endif // _CCCL_HAS_INT128()
+  if constexpr (sizeof(_Tp) == sizeof(uint64_t))
+  {
+    __value = ((__value >> 1) & 0x5555555555555555) | ((__value & 0x5555555555555555) << 1);
+    __value = ((__value >> 2) & 0x3333333333333333) | ((__value & 0x3333333333333333) << 2);
+    __value = ((__value >> 4) & 0x0F0F0F0F0F0F0F0F) | ((__value & 0x0F0F0F0F0F0F0F0F) << 4);
+    __value = ((__value >> 8) & 0x00FF00FF00FF00FF) | ((__value & 0x00FF00FF00FF00FF) << 8);
+    __value = ((__value >> 16) & 0x0000FFFF0000FFFF) | ((__value & 0x0000FFFF0000FFFF) << 16);
+    return (__value >> 32) | (__value << 32);
+  }
+  else if constexpr (sizeof(_Tp) == sizeof(uint32_t))
+  {
+    __value = ((__value >> 1) & 0x55555555) | ((__value & 0x55555555) << 1);
+    __value = ((__value >> 2) & 0x33333333) | ((__value & 0x33333333) << 2);
+    __value = ((__value >> 4) & 0x0F0F0F0F) | ((__value & 0x0F0F0F0F) << 4);
+    __value = ((__value >> 8) & 0x00FF00FF) | ((__value & 0x00FF00FF) << 8);
+    return (__value >> 16) | (__value << 16);
+  }
+  else if constexpr (sizeof(_Tp) == sizeof(uint16_t))
+  {
+    __value = ((__value >> 1) & 0x5555) | ((__value & 0x5555) << 1);
+    __value = ((__value >> 2) & 0x3333) | ((__value & 0x3333) << 2);
+    __value = ((__value >> 4) & 0x0F0F) | ((__value & 0x0F0F) << 4);
+    return (__value >> 8) | (__value << 8);
+  }
+  else
+  {
+    __value = ((__value >> 1) & 0x55) | ((__value & 0x55) << 1);
+    __value = ((__value >> 2) & 0x33) | ((__value & 0x33) << 2);
+    return (__value >> 4) | (__value << 4);
+  }
+}
+
+template <typename _Tp>
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_reverse(_Tp __value) noexcept
+{
+  static_assert(_CUDA_VSTD::__cccl_is_unsigned_integer_v<_Tp>, "bit_reverse() requires unsigned integer types");
+  if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
+  {
+    NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::__bit_reverse_device(__value);))
+  }
+#if defined(_CCCL_BUILTIN_BITREVERSE32)
+  return ::cuda::__bit_reverse_builtin(__value);
+#else
+  return ::cuda::__bit_reverse_generic(__value);
+#endif
+}
+
+_LIBCUDACXX_END_NAMESPACE_CUDA
+
+#endif // _CUDA___BIT_BIT_REVERSE_H
diff --git a/libcudacxx/include/cuda/__bit/bitfield.h b/libcudacxx/include/cuda/__bit/bitfield.h
new file mode 100644
index 00000000000..b39e623c497
--- /dev/null
+++ b/libcudacxx/include/cuda/__bit/bitfield.h
@@ -0,0 +1,118 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___BIT_BITFILED_INSERT_EXTRACT_H
+#define _CUDA___BIT_BITFILED_INSERT_EXTRACT_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__bit/bitmask.h>
+#include <cuda/std/__limits/numeric_limits.h>
+#include <cuda/std/__type_traits/conditional.h>
+#include <cuda/std/__type_traits/is_constant_evaluated.h>
+#include <cuda/std/__type_traits/is_unsigned_integer.h>
+#include <cuda/std/cstdint>
+#include <cuda/std/limits>
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
+
+#if __cccl_ptx_isa >= 200
+
+_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE uint32_t
+__bfi(uint32_t __dest, uint32_t __source, int __start, int __width) noexcept
+{
+  asm("bfi.b32 %0, %1, %2, %3, %4;" : "=r"(__dest) : "r"(__source), "r"(__dest), "r"(__start), "r"(__width));
+  return __dest;
+}
+
+_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE uint64_t
+__bfi(uint64_t __dest, uint64_t __source, int __start, int __width) noexcept
+{
+  asm("bfi.b64 %0, %1, %2, %3, %4;" : "=l"(__dest) : "l"(__source), "l"(__dest), "r"(__start), "r"(__width));
+  return __dest;
+}
+
+_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE uint32_t __bfe(uint32_t __value, int __start, int __width) noexcept
+{
+  uint32_t __ret;
+  asm("bfe.u32 %0, %1, %2, %3;" : "=r"(__ret) : "r"(__value), "r"(__start), "r"(__width));
+  return __ret;
+}
+
+_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE uint64_t __bfe(uint64_t __value, int __start, int __width) noexcept
+{
+  uint64_t __ret;
+  asm("bfe.u64 %0, %1, %2, %3;" : "=l"(__ret) : "l"(__value), "r"(__start), "r"(__width));
+  return __ret;
+}
+
+#endif // __cccl_ptx_isa >= 200
+
+template <typename _Tp>
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp
+bitfield_insert(const _Tp __dest, const _Tp __source, int __start, int __width) noexcept
+{
+  static_assert(_CUDA_VSTD::__cccl_is_unsigned_integer_v<_Tp>, "bitfield_insert() requires unsigned integer types");
+  constexpr auto __digits = _CUDA_VSTD::numeric_limits<_Tp>::digits;
+  _CCCL_ASSERT(__width > 0 && __width <= __digits, "width out of range");
+  _CCCL_ASSERT(__start >= 0 && __start < __digits, "start position out of range");
+  _CCCL_ASSERT(__start + __width <= __digits, "start position + width out of range");
+  if constexpr (sizeof(_Tp) <= sizeof(uint64_t))
+  {
+    if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
+    {
+      // clang-format off
+      NV_DISPATCH_TARGET( // all SM < 70
+        NV_PROVIDES_SM_70, (;),
+        NV_IS_DEVICE,      (using _Up = _CUDA_VSTD::_If<sizeof(_Tp) <= sizeof(uint32_t), uint32_t, uint64_t>;
+                            return ::cuda::__bfi(static_cast<_Up>(__dest), static_cast<_Up>(__source),
+                                                 __start, __width);))
+      // clang-format on
+    }
+  }
+  auto __mask = ::cuda::bitmask<_Tp>(__start, __width);
+  return ((__source << __start) & __mask) | (__dest & ~__mask);
+}
+
+template <typename _Tp>
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp
+bitfield_extract(const _Tp __value, int __start, int __width) noexcept
+{
+  static_assert(_CUDA_VSTD::__cccl_is_unsigned_integer_v<_Tp>, "bitfield_extract() requires unsigned integer types");
+  constexpr auto __digits = _CUDA_VSTD::numeric_limits<_Tp>::digits;
+  _CCCL_ASSERT(__width > 0 && __width <= __digits, "width out of range");
+  _CCCL_ASSERT(__start >= 0 && __start < __digits, "start position out of range");
+  _CCCL_ASSERT(__start + __width <= __digits, "start position + width out of range");
+  if constexpr (sizeof(_Tp) <= sizeof(uint32_t))
+  {
+    if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
+    {
+      // clang-format off
+      NV_DISPATCH_TARGET( // all SM < 70
+        NV_PROVIDES_SM_70, (;),
+        NV_IS_DEVICE,      (using _Up = _CUDA_VSTD::_If<sizeof(_Tp) <= sizeof(uint32_t), uint32_t, uint64_t>;
+                            return ::cuda::__bfe(static_cast<_Up>(__value), __start, __width);))
+      // clang-format on
+    }
+  }
+  return ((__value >> __start) & ::cuda::bitmask<_Tp>(0, __width));
+}
+
+_LIBCUDACXX_END_NAMESPACE_CUDA
+
+#endif // _CUDA___BIT_BITFILED_INSERT_EXTRACT_H
diff --git a/libcudacxx/include/cuda/__bit/bitmask.h b/libcudacxx/include/cuda/__bit/bitmask.h
new file mode 100644
index 00000000000..780847ae4c9
--- /dev/null
+++ b/libcudacxx/include/cuda/__bit/bitmask.h
@@ -0,0 +1,84 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___BIT_BITMASK_H
+#define _CUDA___BIT_BITMASK_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__ptx/instructions/bmsk.h>
+#include <cuda/__ptx/instructions/shl.h>
+#include <cuda/std/__type_traits/conditional.h>
+#include <cuda/std/__type_traits/is_constant_evaluated.h>
+#include <cuda/std/__type_traits/is_unsigned_integer.h>
+#include <cuda/std/limits>
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
+
+template <typename _Tp>
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp __shl(const _Tp __value, int __shift) noexcept
+{
+  if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
+  {
+    if constexpr (sizeof(_Tp) <= sizeof(uint64_t))
+    {
+      NV_DISPATCH_TARGET(NV_IS_DEVICE,
+                         (using _Up = _CUDA_VSTD::_If<sizeof(_Tp) <= sizeof(uint32_t), uint32_t, uint64_t>;
+                          return _CUDA_VPTX::shl(static_cast<_Up>(__value), __shift);))
+    }
+#if _CCCL_HAS_INT128()
+    else
+    {
+      // the compiler should generate exactly four 32-bit shl instructions
+      NV_DISPATCH_TARGET(NV_IS_DEVICE,
+                         (auto __low  = _CUDA_VPTX::shl(static_cast<uint64_t>(__value), __shift);
+                          auto __high = _CUDA_VPTX::shl(static_cast<uint64_t>(__value >> 64), __shift);
+                          return __low | (static_cast<__uint128_t>(__high) << 64);))
+    }
+#endif // _CCCL_HAS_INT128()
+  }
+  constexpr auto __all_ones = static_cast<_Tp>(~_Tp{0});
+  return (__shift == _CUDA_VSTD::numeric_limits<_Tp>::digits) ? __all_ones : __value << __shift;
+}
+
+template <typename _Tp>
+_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bitmask(int __start, int __width) noexcept
+{
+  static_assert(_CUDA_VSTD::__cccl_is_unsigned_integer_v<_Tp>, "bitmask() requires unsigned integer types");
+  constexpr auto __digits = _CUDA_VSTD::numeric_limits<_Tp>::digits;
+  _CCCL_ASSERT(__width > 0 && __width <= __digits, "width out of range");
+  _CCCL_ASSERT(__start >= 0 && __start < __digits, "start position out of range");
+  _CCCL_ASSERT(__start + __width <= __digits, "start position + width out of range");
+  if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated())
+  {
+    if constexpr (sizeof(_Tp) <= sizeof(uint32_t))
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_70, (return _CUDA_VPTX::bmsk_clamp(__start, __width);))
+    }
+    else
+    {
+      NV_IF_TARGET(NV_IS_DEVICE, (return (::cuda::__shl(_Tp{1}, __width) - 1) << __start;))
+    }
+  }
+  constexpr auto __all_ones = static_cast<_Tp>(~_Tp{0});
+  return __width == __digits ? __all_ones : (static_cast<_Tp>((_Tp{1} << __width) - 1) << __start);
+}
+
+_LIBCUDACXX_END_NAMESPACE_CUDA
+
+#endif // _CUDA___BIT_BITMASK_H
diff --git a/libcudacxx/include/cuda/bit b/libcudacxx/include/cuda/bit
new file mode 100644
index 00000000000..82f0bc840e8
--- /dev/null
+++ b/libcudacxx/include/cuda/bit
@@ -0,0 +1,28 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_BIT
+#define _CUDA_BIT
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__bit/bit_reverse.h>
+#include <cuda/__bit/bitfield.h>
+#include <cuda/std/bit>
+
+#endif // _CUDA_BIT
diff --git a/libcudacxx/include/cuda/std/__cccl/builtin.h b/libcudacxx/include/cuda/std/__cccl/builtin.h
index 5377777751a..4649a628dfc 100644
--- a/libcudacxx/include/cuda/std/__cccl/builtin.h
+++ b/libcudacxx/include/cuda/std/__cccl/builtin.h
@@ -237,6 +237,22 @@
 #  undef _CCCL_BUILTIN_BSWAP128
 #endif // _CCCL_CUDA_COMPILER(NVCC)
 
+#if _CCCL_CHECK_BUILTIN(builtin_bitreverse8) && !_CCCL_HAS_CUDA_COMPILER
+#  define _CCCL_BUILTIN_BITREVERSE8(...) __builtin_bitreverse8(__VA_ARGS__)
+#endif
+
+#if _CCCL_CHECK_BUILTIN(builtin_bitreverse16) && !_CCCL_HAS_CUDA_COMPILER
+#  define _CCCL_BUILTIN_BITREVERSE16(...) __builtin_bitreverse16(__VA_ARGS__)
+#endif
+
+#if _CCCL_CHECK_BUILTIN(builtin_bitreverse32) && !_CCCL_HAS_CUDA_COMPILER
+#  define _CCCL_BUILTIN_BITREVERSE32(...) __builtin_bitreverse32(__VA_ARGS__)
+#endif
+
+#if _CCCL_CHECK_BUILTIN(builtin_bitreverse64) && !_CCCL_HAS_CUDA_COMPILER
+#  define _CCCL_BUILTIN_BITREVERSE64(...) __builtin_bitreverse64(__VA_ARGS__)
+#endif
+
 #if _CCCL_CHECK_BUILTIN(builtin_cbrt) || _CCCL_COMPILER(GCC)
 #  define _CCCL_BUILTIN_CBRTF(...) __builtin_cbrtf(__VA_ARGS__)
 #  define _CCCL_BUILTIN_CBRT(...)  __builtin_cbrt(__VA_ARGS__)
diff --git a/libcudacxx/test/libcudacxx/cuda/bit/bit_reverse.pass.cpp b/libcudacxx/test/libcudacxx/cuda/bit/bit_reverse.pass.cpp
new file mode 100644
index 00000000000..d25ed67044e
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/bit/bit_reverse.pass.cpp
@@ -0,0 +1,62 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/bit>
+#include <cuda/std/cassert>
+#include <cuda/std/cstdint>
+#include <cuda/std/type_traits>
+
+#include "test_macros.h"
+
+template <typename T>
+__host__ __device__ constexpr bool test()
+{
+  using nl              = cuda::std::numeric_limits<T>;
+  constexpr T all_ones  = static_cast<T>(~T{0});
+  constexpr T half_low  = all_ones >> (nl::digits / 2u);
+  constexpr T half_high = static_cast<T>(all_ones << (nl::digits / 2u));
+  static_assert(cuda::bit_reverse(all_ones) == all_ones);
+  static_assert(cuda::bit_reverse(T{0}) == T{0});
+  static_assert(cuda::bit_reverse(half_low) == half_high);
+  static_assert(cuda::bit_reverse(T{0b11001001}) == (T{0b10010011} << (nl::digits - 8u)));
+  static_assert(cuda::bit_reverse(T{T{0b10010011} << (nl::digits - 8u)}) == T{0b11001001});
+  unused(all_ones);
+  unused(half_low);
+  unused(half_high);
+  return true;
+}
+
+__host__ __device__ constexpr bool test()
+{
+  test<unsigned char>();
+  test<unsigned short>();
+  test<unsigned>();
+  test<unsigned long>();
+  test<unsigned long long>();
+
+  test<uint8_t>();
+  test<uint16_t>();
+  test<uint32_t>();
+  test<uint64_t>();
+  test<size_t>();
+  test<uintmax_t>();
+  test<uintptr_t>();
+
+#if _CCCL_HAS_INT128()
+  test<__uint128_t>();
+#endif // _CCCL_HAS_INT128()
+  return true;
+}
+
+int main(int, char**)
+{
+  assert(test());
+  static_assert(test());
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/bit/bitfield.fail.cpp b/libcudacxx/test/libcudacxx/cuda/bit/bitfield.fail.cpp
new file mode 100644
index 00000000000..1e4f4bd6efb
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/bit/bitfield.fail.cpp
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/bit>
+#include <cuda/std/cassert>
+#include <cuda/std/cstdint>
+#include <cuda/std/type_traits>
+
+#include "test_macros.h"
+
+int main(int, char**)
+{
+  using T = uint32_t;
+  static_assert(cuda::bitfield_insert(int{0}, int{0}, 0, 0));
+  static_assert(cuda::bitfield_insert(T{0}, T{0}, -1, 0));
+  static_assert(cuda::bitfield_insert(T{0}, T{0}, 0, -1));
+  static_assert(cuda::bitfield_insert(T{0}, T{0}, 0, 33));
+  static_assert(cuda::bitfield_insert(T{0}, T{0}, 32, 0));
+  static_assert(cuda::bitfield_insert(T{0}, T{0}, 20, 20));
+
+  static_assert(cuda::bitfield_extract(T{0}, -1, 0));
+  static_assert(cuda::bitfield_extract(T{0}, 0, -1));
+  static_assert(cuda::bitfield_extract(T{0}, 0, 33));
+  static_assert(cuda::bitfield_extract(T{0}, 32, 0));
+  static_assert(cuda::bitfield_extract(T{0}, 20, 20));
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/bit/bitfield.pass.cpp b/libcudacxx/test/libcudacxx/cuda/bit/bitfield.pass.cpp
new file mode 100644
index 00000000000..5099b7b87d2
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/bit/bitfield.pass.cpp
@@ -0,0 +1,80 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/bit>
+#include <cuda/std/cassert>
+#include <cuda/std/cstdint>
+#include <cuda/std/type_traits>
+
+#include "test_macros.h"
+
+template <typename T>
+__host__ __device__ constexpr bool test()
+{
+  using nl             = cuda::std::numeric_limits<T>;
+  constexpr T all_ones = static_cast<T>(~T{0});
+  unused(all_ones);
+  assert(cuda::bitfield_insert(T{0}, all_ones, 0, 1) == 1);
+  assert(cuda::bitfield_insert(T{0}, all_ones, 1, 1) == 0b10);
+  assert(cuda::bitfield_insert(T{0b10}, all_ones, 0, 1) == 0b11);
+  assert(cuda::bitfield_insert(all_ones, all_ones, 0, 1) == all_ones);
+  assert(cuda::bitfield_insert(all_ones, all_ones, 2, 1) == all_ones);
+  assert(cuda::bitfield_insert(all_ones, T{0b1000}, 1, 2) == (all_ones & static_cast<T>(~T{0b110})));
+
+  assert(cuda::bitfield_insert(T{0}, all_ones, 0, 2) == 0b11);
+  assert(cuda::bitfield_insert(T{0}, all_ones, 3, 2) == 0b11000);
+  assert(cuda::bitfield_insert(T{0b10100000}, all_ones, 3, 2) == 0b10111000);
+  assert(cuda::bitfield_insert(T{0b10100000}, T{0b11}, 3, 2) == 0b10111000);
+  assert(cuda::bitfield_insert(T{0}, all_ones, nl::digits - 1, 1) == (T{1} << (nl::digits - 1u)));
+  assert(cuda::bitfield_insert(T{0b10100000}, all_ones, 0, nl::digits) == all_ones);
+
+  assert(cuda::bitfield_extract(T{0}, 3, 4) == 0);
+  assert(cuda::bitfield_extract(T{0b1011}, 0, 1) == 1);
+  assert(cuda::bitfield_extract(T{0b1011}, 1, 1) == 1);
+  assert(cuda::bitfield_extract(T{0b1011}, 2, 2) == 0b10);
+  assert(cuda::bitfield_extract(all_ones, 0, 4) == 0b1111);
+  assert(cuda::bitfield_extract(all_ones, 2, 4) == 0b1111);
+
+  assert(cuda::bitfield_extract(T{0b1010010}, 0, 2) == 0b10);
+  assert(cuda::bitfield_extract(T{0b10101100}, 3, 2) == 1);
+  assert(cuda::bitfield_extract(T{0b10100000}, 3, 3) == 0b100);
+
+  assert(cuda::bitfield_extract(T{all_ones}, nl::digits - 1, 1) == 1);
+  assert(cuda::bitfield_extract(T{0b10100000}, 0, nl::digits) == T{0b10100000});
+  return true;
+}
+
+__host__ __device__ constexpr bool test()
+{
+  test<unsigned char>();
+  test<unsigned short>();
+  test<unsigned>();
+  test<unsigned long>();
+  test<unsigned long long>();
+
+  test<uint8_t>();
+  test<uint16_t>();
+  test<uint32_t>();
+  test<uint64_t>();
+  test<size_t>();
+  test<uintmax_t>();
+  test<uintptr_t>();
+
+#if _CCCL_HAS_INT128()
+  test<__uint128_t>();
+#endif // _CCCL_HAS_INT128()
+  return true;
+}
+
+int main(int, char**)
+{
+  assert(test());
+  static_assert(test());
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/bit/bitmask.pass.cpp b/libcudacxx/test/libcudacxx/cuda/bit/bitmask.pass.cpp
new file mode 100644
index 00000000000..5ec6960e09d
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/bit/bitmask.pass.cpp
@@ -0,0 +1,62 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/bit>
+#include <cuda/std/cassert>
+#include <cuda/std/cstdint>
+#include <cuda/std/type_traits>
+
+#include "test_macros.h"
+
+template <typename T>
+__host__ __device__ constexpr bool test()
+{
+  using nl             = cuda::std::numeric_limits<T>;
+  constexpr T all_ones = static_cast<T>(~T{0});
+  unused(all_ones);
+  assert(cuda::bitmask<T>(0, 1) == 1);
+  assert(cuda::bitmask<T>(1, 1) == 0b10);
+  assert(cuda::bitmask<T>(0, 2) == 0b11);
+  assert(cuda::bitmask<T>(2, 2) == 0b1100);
+
+  assert(cuda::bitmask<T>(0, 2) == 0b11);
+  assert(cuda::bitmask<T>(3, 2) == 0b11000);
+  assert(cuda::bitmask<T>(nl::digits - 1, 1) == (T{1} << (nl::digits - 1u)));
+  assert(cuda::bitmask<T>(0, nl::digits) == all_ones);
+  return true;
+}
+
+__host__ __device__ constexpr bool test()
+{
+  test<unsigned char>();
+  test<unsigned short>();
+  test<unsigned>();
+  test<unsigned long>();
+  test<unsigned long long>();
+
+  test<uint8_t>();
+  test<uint16_t>();
+  test<uint32_t>();
+  test<uint64_t>();
+  test<size_t>();
+  test<uintmax_t>();
+  test<uintptr_t>();
+
+#if _CCCL_HAS_INT128()
+  test<__uint128_t>();
+#endif // _CCCL_HAS_INT128()
+  return true;
+}
+
+int main(int, char**)
+{
+  assert(test());
+  static_assert(test());
+  return 0;
+}