From b9e7d38d7ff6c114affcecd440bfa3c5ffcb79dd Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Wed, 5 Mar 2025 12:48:49 -0800 Subject: [PATCH] Extended `` operations: `bitfield_insert`, `bitfield_extract`, `bit_reverse`, `bitmask` (#3941) Co-authored-by: Michael Schellenberger Costa Co-authored-by: Bernhard Manfred Gruber --- docs/libcudacxx/extended_api.rst | 1 + docs/libcudacxx/extended_api/bit.rst | 37 ++++ .../extended_api/bit/bit_reverse.rst | 66 +++++++ .../extended_api/bit/bitfield_extract.rst | 69 ++++++++ .../extended_api/bit/bitfield_insert.rst | 70 ++++++++ docs/libcudacxx/extended_api/bit/bitmask.rst | 69 ++++++++ libcudacxx/include/cuda/__bit/bit_reverse.h | 167 ++++++++++++++++++ libcudacxx/include/cuda/__bit/bitfield.h | 118 +++++++++++++ libcudacxx/include/cuda/__bit/bitmask.h | 84 +++++++++ libcudacxx/include/cuda/bit | 28 +++ libcudacxx/include/cuda/std/__cccl/builtin.h | 16 ++ .../libcudacxx/cuda/bit/bit_reverse.pass.cpp | 62 +++++++ .../libcudacxx/cuda/bit/bitfield.fail.cpp | 33 ++++ .../libcudacxx/cuda/bit/bitfield.pass.cpp | 80 +++++++++ .../test/libcudacxx/cuda/bit/bitmask.pass.cpp | 62 +++++++ 15 files changed, 962 insertions(+) create mode 100644 docs/libcudacxx/extended_api/bit.rst create mode 100644 docs/libcudacxx/extended_api/bit/bit_reverse.rst create mode 100644 docs/libcudacxx/extended_api/bit/bitfield_extract.rst create mode 100644 docs/libcudacxx/extended_api/bit/bitfield_insert.rst create mode 100644 docs/libcudacxx/extended_api/bit/bitmask.rst create mode 100644 libcudacxx/include/cuda/__bit/bit_reverse.h create mode 100644 libcudacxx/include/cuda/__bit/bitfield.h create mode 100644 libcudacxx/include/cuda/__bit/bitmask.h create mode 100644 libcudacxx/include/cuda/bit create mode 100644 libcudacxx/test/libcudacxx/cuda/bit/bit_reverse.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/bit/bitfield.fail.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/bit/bitfield.pass.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/bit/bitmask.pass.cpp diff --git a/docs/libcudacxx/extended_api.rst b/docs/libcudacxx/extended_api.rst index 1df995a432f..0c7a5b21a6a 100644 --- a/docs/libcudacxx/extended_api.rst +++ b/docs/libcudacxx/extended_api.rst @@ -6,6 +6,7 @@ Extended API .. toctree:: :maxdepth: 2 + extended_api/bit extended_api/execution_model extended_api/memory_model extended_api/thread_groups diff --git a/docs/libcudacxx/extended_api/bit.rst b/docs/libcudacxx/extended_api/bit.rst new file mode 100644 index 00000000000..7e433913089 --- /dev/null +++ b/docs/libcudacxx/extended_api/bit.rst @@ -0,0 +1,37 @@ +.. _libcudacxx-extended-api-bit: + +Bit +=== + +.. toctree:: + :hidden: + :maxdepth: 1 + + cuda::bitmask + cuda::bit_reverse + cuda::bitfield_insert + cuda::bitfield_extract + +.. list-table:: + :widths: 25 45 30 30 + :header-rows: 0 + + * - :ref:`bitmask ` + - Generate a bitmask + - CCCL 3.0.0 + - CUDA 13.0 + + * - :ref:`bit_reverse ` + - Reverse the order of bits + - CCCL 3.0.0 + - CUDA 13.0 + + * - :ref:`bitfield_insert ` + - Insert a bitfield + - CCCL 3.0.0 + - CUDA 13.0 + + * - :ref:`bitfield_extract ` + - Extract a bitfield + - CCCL 3.0.0 + - CUDA 13.0 diff --git a/docs/libcudacxx/extended_api/bit/bit_reverse.rst b/docs/libcudacxx/extended_api/bit/bit_reverse.rst new file mode 100644 index 00000000000..63dd7b3fcd6 --- /dev/null +++ b/docs/libcudacxx/extended_api/bit/bit_reverse.rst @@ -0,0 +1,66 @@ +.. _libcudacxx-extended-api-bit-bit_reverse: + +``bit_reverse`` +=============== + +.. code:: cpp + + template + [[nodiscard]] constexpr T + bit_reverse(T value) noexcept; + +The function reverses the order of bits in a value. + +**Parameters** + +- ``value``: Input value + +**Return value** + +- Value with reversed bits + +**Mandates** + +- ``T`` is an unsigned integer type. + +**Performance considerations** + +The function performs the following operations: + +- Device: + + - ``uint8_t`` ``uint16_t``: ``PRMT``, ``BREV`` + - ``uint32_t``: ``BREV`` + - ``uint64_t``: ``BREV`` x2, ``MOV`` x2 + - ``uint128_t``: ``BREV`` x4, ``MOV`` x4 + +- Host: ``__builtin_bitreverse`` with clang + +.. note:: + + When the input values are run-time values that the compiler can resolve at compile-time, e.g. an index of a loop with a fixed number of iterations, using the function could not be optimal. + +.. note:: + + GCC <= 8 uses a slow path with more instructions even in CUDA + +Example +------- + +.. code:: cpp + + #include + #include + + __global__ void bit_reverse_kernel() { + assert(bitfield_reverse(0u) == ~0u); + assert(bitfield_reverse(uint8_t{0b00001011}) == uint8_t{0b11010000}); + } + + int main() { + bit_reverse_kernel<<<1, 1>>>(); + cudaDeviceSynchronize(); + return 0; + } + +`See it on Godbolt 🔗 `_ diff --git a/docs/libcudacxx/extended_api/bit/bitfield_extract.rst b/docs/libcudacxx/extended_api/bit/bitfield_extract.rst new file mode 100644 index 00000000000..2d9d59c72ff --- /dev/null +++ b/docs/libcudacxx/extended_api/bit/bitfield_extract.rst @@ -0,0 +1,69 @@ +.. _libcudacxx-extended-api-bit-bitfield_extract: + +``bitfield_extract`` +==================== + +.. code:: cpp + + template + [[nodiscard]] constexpr T + bitfield_extract(T value, int start, int width) noexcept; + +The function extracts a bitfield from a value and returns it in the lower bits. +``bitfield_extract()`` computes ``(value >> start) & bitfield``, where ``bitfield`` is a sequence of bits of width ``width``. + +**Parameters** + +- ``value``: The value to apply the bitfield. +- ``start``: Initial position of the bitfield. +- ``width``: Width of the bitfield. + +**Return value** + +- ``(value >> start) & bitfield``. + +**Mandates** + +- ``T`` is an unsigned integer type. + +**Preconditions** + + - ``start >= 0 && start < num_bits(T)`` + - ``width > 0 && width <= num_bits(T)`` + - ``start + width <= num_bits(T)`` + +**Performance considerations** + +The function performs the following operations in CUDA for ``uint8_t``, ``uint16_t``, ``uint32_t``: + +- ``SM < 70``: ``BFE`` +- ``SM >= 70``: ``BMSK``, bitwise operation x2 + +.. note:: + + When the input values are run-time values that the compiler can resolve at compile-time, e.g. an index of a loop with a fixed number of iterations, using the function could not be optimal. + +.. note:: + + GCC <= 8 uses a slow path with more instructions even in CUDA + +Example +------- + +.. code:: cpp + + #include + #include + + __global__ void bitfield_insert_kernel() { + assert(cuda::bitfield_extract(~0u, 0, 4) == 0b1111); + assert(cuda::bitfield_extract(0b1011000, 3, 4) == 0b1011); + } + + int main() { + bitfield_insert_kernel<<<1, 1>>>(); + cudaDeviceSynchronize(); + return 0; + } + +`See it on Godbolt 🔗 `_ diff --git a/docs/libcudacxx/extended_api/bit/bitfield_insert.rst b/docs/libcudacxx/extended_api/bit/bitfield_insert.rst new file mode 100644 index 00000000000..23015d65f05 --- /dev/null +++ b/docs/libcudacxx/extended_api/bit/bitfield_insert.rst @@ -0,0 +1,70 @@ +.. _libcudacxx-extended-api-bit-bitfield_insert: + +``bitfield_insert`` +=================== + +.. code:: cpp + + template + [[nodiscard]] constexpr T + bitfield_insert(T dest, T source, int start, int width) noexcept; + +The function extracts the lower bitfield of size ``width`` from ``source`` and inserts it into ``dest`` at position ``start``. + +**Parameters** + +- ``dest``: The value to insert the bitfield. +- ``source``: The value from which extract the bitfield. +- ``start``: Initial position of the bitfield. +- ``width``: Width of the bitfield. + +**Return value** + +- ``((value << start) & mask) | (source & ~mask)``, where ``mask`` is a bitmask of width ``width``. + +**Mandates** + +- ``T`` is an unsigned integer type. + +**Preconditions** + + - ``start >= 0 && start < num_bits(T)`` + - ``width > 0 && width <= num_bits(T)`` + - ``start + width <= num_bits(T)`` + +**Performance considerations** + +The function performs the following operations in CUDA for ``uint8_t``, ``uint16_t``, ``uint32_t``: + +- ``SM < 70``: ``BFI`` +- ``SM >= 70``: ``BMSK``, bitwise operation x5 + +.. note:: + + When the input values are run-time values that the compiler can resolve at compile-time, e.g. an index of a loop with a fixed number of iterations, using the function could not be optimal. + +.. note:: + + GCC <= 8 uses a slow path with more instructions even in CUDA + +Example +------- + +.. code:: cpp + + #include + #include + + __global__ void bitfield_insert_kernel() { + assert(cuda::bitfield_insert(0u, 0xFFFFu, 0, 4) == 0b1111); + assert(cuda::bitfield_insert(0u, 0xFFFFu, 3, 4) == 0b1111000); + assert(cuda::bitfield_insert(1u, 0xFFFFu, 3, 4) == 0b1111001); + } + + int main() { + bitfield_insert_kernel<<<1, 1>>>(); + cudaDeviceSynchronize(); + return 0; + } + +`See it on Godbolt 🔗 `_ diff --git a/docs/libcudacxx/extended_api/bit/bitmask.rst b/docs/libcudacxx/extended_api/bit/bitmask.rst new file mode 100644 index 00000000000..0308645df95 --- /dev/null +++ b/docs/libcudacxx/extended_api/bit/bitmask.rst @@ -0,0 +1,69 @@ +.. _libcudacxx-extended-api-bit-bitmask: + +``bitmask`` +=========== + +.. code:: cpp + + template + [[nodiscard]] constexpr T + bitmask(int start, int width) noexcept; + +The function generates a bitmask of size ``width`` starting at position ``start``. + +**Parameters** + +- ``start``: starting position of the bitmask +- ``width``: width of the bitmask + +**Return value** + +- Bitmask of size ``width`` starting at ``start`` + +**Mandates** + +- ``T`` is an unsigned integral type. + +**Preconditions** + + - ``start >= 0 && start < num_bits(T)`` + - ``width > 0 && width <= num_bits(T)`` + - ``start + width <= num_bits(T)`` + +**Performance considerations** + +The function performs the following operations in device code: + +- ``uint8_t``, ``uint16_t``, ``uint32_t``: ``BMSK`` +- ``uint64_t``: ``SHL`` x4, ``UADD`` x2 +- ``uint128_t``: ``SHL`` x8, ``UADD`` x4 + +.. note:: + + When the input values are run-time values that the compiler can resolve at compile-time, e.g. an index of a loop with a fixed number of iterations, using the function could not be optimal. + +.. note:: + + GCC <= 8 uses a slow path with more instructions even in CUDA + +Example +------- + +.. code:: cpp + + #include + #include + #include + + __global__ void bitmask_kernel() { + assert(cuda::bitmask(2, 4) == 0b111100u); + assert(cuda::bitmask(1, 3) == uint64_t{0b1110}); + } + + int main() { + bitmask_kernel<<<1, 1>>>(); + cudaDeviceSynchronize(); + return 0; + } + +`See it on Godbolt 🔗 `_ diff --git a/libcudacxx/include/cuda/__bit/bit_reverse.h b/libcudacxx/include/cuda/__bit/bit_reverse.h new file mode 100644 index 00000000000..1c02074dbe9 --- /dev/null +++ b/libcudacxx/include/cuda/__bit/bit_reverse.h @@ -0,0 +1,167 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___BIT_BIT_REVERSE_H +#define _CUDA___BIT_BIT_REVERSE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +#if defined(_CCCL_BUILTIN_BITREVERSE32) + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp __bit_reverse_builtin(_Tp __value) noexcept +{ +# if _CCCL_HAS_INT128() + if constexpr (sizeof(_Tp) == sizeof(__uint128_t)) + { + auto __high = static_cast<__uint128_t>(_CCCL_BUILTIN_BITREVERSE64(static_cast(__value))) << 64; + auto __low = static_cast<__uint128_t>(_CCCL_BUILTIN_BITREVERSE64(static_cast(__value >> 64))); + return __high | __low; + } +# endif // _CCCL_HAS_INT128() + if constexpr (sizeof(_Tp) == sizeof(uint64_t)) + { + return _CCCL_BUILTIN_BITREVERSE64(__value); + } + else if constexpr (sizeof(_Tp) == sizeof(uint32_t)) + { + return _CCCL_BUILTIN_BITREVERSE32(__value); + } + else if constexpr (sizeof(_Tp) == sizeof(uint16_t)) + { + return _CCCL_BUILTIN_BITREVERSE16(__value); + } + else + { + return _CCCL_BUILTIN_BITREVERSE8(__value); + } +} + +#endif // defined(_CCCL_BUILTIN_BITREVERSE32) + +#if _CCCL_HAS_CUDA_COMPILER + +template +_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE constexpr _Tp __bit_reverse_device(_Tp __value) noexcept +{ +# if _CCCL_HAS_INT128() + if constexpr (sizeof(_Tp) == sizeof(__uint128_t)) + { + auto __high = static_cast<__uint128_t>(::cuda::__bit_reverse_device(static_cast(__value))) << 64; + auto __low = static_cast<__uint128_t>(::cuda::__bit_reverse_device(static_cast(__value >> 64))); + return __high | __low; + } +# endif // _CCCL_HAS_INT128() + if constexpr (sizeof(_Tp) == sizeof(uint64_t)) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __brevll(__value);)) + } + else if constexpr (sizeof(_Tp) == sizeof(uint32_t)) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __brev(__value);)) + } + else if constexpr (sizeof(_Tp) == sizeof(uint16_t)) + { + NV_IF_TARGET(NV_IS_DEVICE, (return __brev(static_cast(__value) << 16);)) + } + else + { + NV_IF_TARGET(NV_IS_DEVICE, (return __brev(static_cast(__value) << 24);)) + } + _CCCL_UNREACHABLE(); +} + +#endif // _CCCL_HAS_CUDA_COMPILER + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp __bit_reverse_generic(_Tp __value) noexcept +{ +#if _CCCL_HAS_INT128() + if constexpr (sizeof(_Tp) == sizeof(__uint128_t)) + { + constexpr auto __c1 = __uint128_t{0x5555555555555555} << 64 | uint64_t{0x5555555555555555}; + constexpr auto __c2 = __uint128_t{0x3333333333333333} << 64 | uint64_t{0x3333333333333333}; + constexpr auto __c3 = __uint128_t{0x0F0F0F0F0F0F0F0F} << 64 | uint64_t{0x0F0F0F0F0F0F0F0F}; + constexpr auto __c4 = __uint128_t{0x00FF00FF00FF00FF} << 64 | uint64_t{0x00FF00FF00FF00FF}; + constexpr auto __c5 = __uint128_t{0x0000FFFF0000FFFF} << 64 | uint64_t{0x0000FFFF0000FFFF}; + constexpr auto __c6 = __uint128_t{0x00000000FFFFFFFF} << 64 | uint64_t{0x00000000FFFFFFFF}; + __value = ((__value >> 1) & __c1) | ((__value & __c1) << 1); + __value = ((__value >> 2) & __c2) | ((__value & __c2) << 2); + __value = ((__value >> 4) & __c3) | ((__value & __c3) << 4); + __value = ((__value >> 8) & __c4) | ((__value & __c4) << 8); + __value = ((__value >> 16) & __c5) | ((__value & __c5) << 16); + __value = ((__value >> 32) & __c6) | ((__value & __c6) << 32); + return (__value >> 64) | (__value << 64); + } +#endif // _CCCL_HAS_INT128() + if constexpr (sizeof(_Tp) == sizeof(uint64_t)) + { + __value = ((__value >> 1) & 0x5555555555555555) | ((__value & 0x5555555555555555) << 1); + __value = ((__value >> 2) & 0x3333333333333333) | ((__value & 0x3333333333333333) << 2); + __value = ((__value >> 4) & 0x0F0F0F0F0F0F0F0F) | ((__value & 0x0F0F0F0F0F0F0F0F) << 4); + __value = ((__value >> 8) & 0x00FF00FF00FF00FF) | ((__value & 0x00FF00FF00FF00FF) << 8); + __value = ((__value >> 16) & 0x0000FFFF0000FFFF) | ((__value & 0x0000FFFF0000FFFF) << 16); + return (__value >> 32) | (__value << 32); + } + else if constexpr (sizeof(_Tp) == sizeof(uint32_t)) + { + __value = ((__value >> 1) & 0x55555555) | ((__value & 0x55555555) << 1); + __value = ((__value >> 2) & 0x33333333) | ((__value & 0x33333333) << 2); + __value = ((__value >> 4) & 0x0F0F0F0F) | ((__value & 0x0F0F0F0F) << 4); + __value = ((__value >> 8) & 0x00FF00FF) | ((__value & 0x00FF00FF) << 8); + return (__value >> 16) | (__value << 16); + } + else if constexpr (sizeof(_Tp) == sizeof(uint16_t)) + { + __value = ((__value >> 1) & 0x5555) | ((__value & 0x5555) << 1); + __value = ((__value >> 2) & 0x3333) | ((__value & 0x3333) << 2); + __value = ((__value >> 4) & 0x0F0F) | ((__value & 0x0F0F) << 4); + return (__value >> 8) | (__value << 8); + } + else + { + __value = ((__value >> 1) & 0x55) | ((__value & 0x55) << 1); + __value = ((__value >> 2) & 0x33) | ((__value & 0x33) << 2); + return (__value >> 4) | (__value << 4); + } +} + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bit_reverse(_Tp __value) noexcept +{ + static_assert(_CUDA_VSTD::__cccl_is_unsigned_integer_v<_Tp>, "bit_reverse() requires unsigned integer types"); + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::__bit_reverse_device(__value);)) + } +#if defined(_CCCL_BUILTIN_BITREVERSE32) + return ::cuda::__bit_reverse_builtin(__value); +#else + return ::cuda::__bit_reverse_generic(__value); +#endif +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___BIT_BIT_REVERSE_H diff --git a/libcudacxx/include/cuda/__bit/bitfield.h b/libcudacxx/include/cuda/__bit/bitfield.h new file mode 100644 index 00000000000..b39e623c497 --- /dev/null +++ b/libcudacxx/include/cuda/__bit/bitfield.h @@ -0,0 +1,118 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___BIT_BITFILED_INSERT_EXTRACT_H +#define _CUDA___BIT_BITFILED_INSERT_EXTRACT_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +#if __cccl_ptx_isa >= 200 + +_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE uint32_t +__bfi(uint32_t __dest, uint32_t __source, int __start, int __width) noexcept +{ + asm("bfi.b32 %0, %1, %2, %3, %4;" : "=r"(__dest) : "r"(__source), "r"(__dest), "r"(__start), "r"(__width)); + return __dest; +} + +_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE uint64_t +__bfi(uint64_t __dest, uint64_t __source, int __start, int __width) noexcept +{ + asm("bfi.b64 %0, %1, %2, %3, %4;" : "=l"(__dest) : "l"(__source), "l"(__dest), "r"(__start), "r"(__width)); + return __dest; +} + +_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE uint32_t __bfe(uint32_t __value, int __start, int __width) noexcept +{ + uint32_t __ret; + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(__ret) : "r"(__value), "r"(__start), "r"(__width)); + return __ret; +} + +_CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE uint64_t __bfe(uint64_t __value, int __start, int __width) noexcept +{ + uint64_t __ret; + asm("bfe.u64 %0, %1, %2, %3;" : "=l"(__ret) : "l"(__value), "r"(__start), "r"(__width)); + return __ret; +} + +#endif // __cccl_ptx_isa >= 200 + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp +bitfield_insert(const _Tp __dest, const _Tp __source, int __start, int __width) noexcept +{ + static_assert(_CUDA_VSTD::__cccl_is_unsigned_integer_v<_Tp>, "bitfield_insert() requires unsigned integer types"); + constexpr auto __digits = _CUDA_VSTD::numeric_limits<_Tp>::digits; + _CCCL_ASSERT(__width > 0 && __width <= __digits, "width out of range"); + _CCCL_ASSERT(__start >= 0 && __start < __digits, "start position out of range"); + _CCCL_ASSERT(__start + __width <= __digits, "start position + width out of range"); + if constexpr (sizeof(_Tp) <= sizeof(uint64_t)) + { + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + // clang-format off + NV_DISPATCH_TARGET( // all SM < 70 + NV_PROVIDES_SM_70, (;), + NV_IS_DEVICE, (using _Up = _CUDA_VSTD::_If; + return ::cuda::__bfi(static_cast<_Up>(__dest), static_cast<_Up>(__source), + __start, __width);)) + // clang-format on + } + } + auto __mask = ::cuda::bitmask<_Tp>(__start, __width); + return ((__source << __start) & __mask) | (__dest & ~__mask); +} + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp +bitfield_extract(const _Tp __value, int __start, int __width) noexcept +{ + static_assert(_CUDA_VSTD::__cccl_is_unsigned_integer_v<_Tp>, "bitfield_extract() requires unsigned integer types"); + constexpr auto __digits = _CUDA_VSTD::numeric_limits<_Tp>::digits; + _CCCL_ASSERT(__width > 0 && __width <= __digits, "width out of range"); + _CCCL_ASSERT(__start >= 0 && __start < __digits, "start position out of range"); + _CCCL_ASSERT(__start + __width <= __digits, "start position + width out of range"); + if constexpr (sizeof(_Tp) <= sizeof(uint32_t)) + { + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + // clang-format off + NV_DISPATCH_TARGET( // all SM < 70 + NV_PROVIDES_SM_70, (;), + NV_IS_DEVICE, (using _Up = _CUDA_VSTD::_If; + return ::cuda::__bfe(static_cast<_Up>(__value), __start, __width);)) + // clang-format on + } + } + return ((__value >> __start) & ::cuda::bitmask<_Tp>(0, __width)); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___BIT_BITFILED_INSERT_EXTRACT_H diff --git a/libcudacxx/include/cuda/__bit/bitmask.h b/libcudacxx/include/cuda/__bit/bitmask.h new file mode 100644 index 00000000000..780847ae4c9 --- /dev/null +++ b/libcudacxx/include/cuda/__bit/bitmask.h @@ -0,0 +1,84 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___BIT_BITMASK_H +#define _CUDA___BIT_BITMASK_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp __shl(const _Tp __value, int __shift) noexcept +{ + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + if constexpr (sizeof(_Tp) <= sizeof(uint64_t)) + { + NV_DISPATCH_TARGET(NV_IS_DEVICE, + (using _Up = _CUDA_VSTD::_If; + return _CUDA_VPTX::shl(static_cast<_Up>(__value), __shift);)) + } +#if _CCCL_HAS_INT128() + else + { + // the compiler should generate exactly four 32-bit shl instructions + NV_DISPATCH_TARGET(NV_IS_DEVICE, + (auto __low = _CUDA_VPTX::shl(static_cast(__value), __shift); + auto __high = _CUDA_VPTX::shl(static_cast(__value >> 64), __shift); + return __low | (static_cast<__uint128_t>(__high) << 64);)) + } +#endif // _CCCL_HAS_INT128() + } + constexpr auto __all_ones = static_cast<_Tp>(~_Tp{0}); + return (__shift == _CUDA_VSTD::numeric_limits<_Tp>::digits) ? __all_ones : __value << __shift; +} + +template +_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI constexpr _Tp bitmask(int __start, int __width) noexcept +{ + static_assert(_CUDA_VSTD::__cccl_is_unsigned_integer_v<_Tp>, "bitmask() requires unsigned integer types"); + constexpr auto __digits = _CUDA_VSTD::numeric_limits<_Tp>::digits; + _CCCL_ASSERT(__width > 0 && __width <= __digits, "width out of range"); + _CCCL_ASSERT(__start >= 0 && __start < __digits, "start position out of range"); + _CCCL_ASSERT(__start + __width <= __digits, "start position + width out of range"); + if (!_CUDA_VSTD::__cccl_default_is_constant_evaluated()) + { + if constexpr (sizeof(_Tp) <= sizeof(uint32_t)) + { + NV_IF_TARGET(NV_PROVIDES_SM_70, (return _CUDA_VPTX::bmsk_clamp(__start, __width);)) + } + else + { + NV_IF_TARGET(NV_IS_DEVICE, (return (::cuda::__shl(_Tp{1}, __width) - 1) << __start;)) + } + } + constexpr auto __all_ones = static_cast<_Tp>(~_Tp{0}); + return __width == __digits ? __all_ones : (static_cast<_Tp>((_Tp{1} << __width) - 1) << __start); +} + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // _CUDA___BIT_BITMASK_H diff --git a/libcudacxx/include/cuda/bit b/libcudacxx/include/cuda/bit new file mode 100644 index 00000000000..82f0bc840e8 --- /dev/null +++ b/libcudacxx/include/cuda/bit @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_BIT +#define _CUDA_BIT + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#endif // _CUDA_BIT diff --git a/libcudacxx/include/cuda/std/__cccl/builtin.h b/libcudacxx/include/cuda/std/__cccl/builtin.h index 5377777751a..4649a628dfc 100644 --- a/libcudacxx/include/cuda/std/__cccl/builtin.h +++ b/libcudacxx/include/cuda/std/__cccl/builtin.h @@ -237,6 +237,22 @@ # undef _CCCL_BUILTIN_BSWAP128 #endif // _CCCL_CUDA_COMPILER(NVCC) +#if _CCCL_CHECK_BUILTIN(builtin_bitreverse8) && !_CCCL_HAS_CUDA_COMPILER +# define _CCCL_BUILTIN_BITREVERSE8(...) __builtin_bitreverse8(__VA_ARGS__) +#endif + +#if _CCCL_CHECK_BUILTIN(builtin_bitreverse16) && !_CCCL_HAS_CUDA_COMPILER +# define _CCCL_BUILTIN_BITREVERSE16(...) __builtin_bitreverse16(__VA_ARGS__) +#endif + +#if _CCCL_CHECK_BUILTIN(builtin_bitreverse32) && !_CCCL_HAS_CUDA_COMPILER +# define _CCCL_BUILTIN_BITREVERSE32(...) __builtin_bitreverse32(__VA_ARGS__) +#endif + +#if _CCCL_CHECK_BUILTIN(builtin_bitreverse64) && !_CCCL_HAS_CUDA_COMPILER +# define _CCCL_BUILTIN_BITREVERSE64(...) __builtin_bitreverse64(__VA_ARGS__) +#endif + #if _CCCL_CHECK_BUILTIN(builtin_cbrt) || _CCCL_COMPILER(GCC) # define _CCCL_BUILTIN_CBRTF(...) __builtin_cbrtf(__VA_ARGS__) # define _CCCL_BUILTIN_CBRT(...) __builtin_cbrt(__VA_ARGS__) diff --git a/libcudacxx/test/libcudacxx/cuda/bit/bit_reverse.pass.cpp b/libcudacxx/test/libcudacxx/cuda/bit/bit_reverse.pass.cpp new file mode 100644 index 00000000000..d25ed67044e --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/bit/bit_reverse.pass.cpp @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ constexpr bool test() +{ + using nl = cuda::std::numeric_limits; + constexpr T all_ones = static_cast(~T{0}); + constexpr T half_low = all_ones >> (nl::digits / 2u); + constexpr T half_high = static_cast(all_ones << (nl::digits / 2u)); + static_assert(cuda::bit_reverse(all_ones) == all_ones); + static_assert(cuda::bit_reverse(T{0}) == T{0}); + static_assert(cuda::bit_reverse(half_low) == half_high); + static_assert(cuda::bit_reverse(T{0b11001001}) == (T{0b10010011} << (nl::digits - 8u))); + static_assert(cuda::bit_reverse(T{T{0b10010011} << (nl::digits - 8u)}) == T{0b11001001}); + unused(all_ones); + unused(half_low); + unused(half_high); + return true; +} + +__host__ __device__ constexpr bool test() +{ + test(); + test(); + test(); + test(); + test(); + + test(); + test(); + test(); + test(); + test(); + test(); + test(); + +#if _CCCL_HAS_INT128() + test<__uint128_t>(); +#endif // _CCCL_HAS_INT128() + return true; +} + +int main(int, char**) +{ + assert(test()); + static_assert(test()); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/bit/bitfield.fail.cpp b/libcudacxx/test/libcudacxx/cuda/bit/bitfield.fail.cpp new file mode 100644 index 00000000000..1e4f4bd6efb --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/bit/bitfield.fail.cpp @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include "test_macros.h" + +int main(int, char**) +{ + using T = uint32_t; + static_assert(cuda::bitfield_insert(int{0}, int{0}, 0, 0)); + static_assert(cuda::bitfield_insert(T{0}, T{0}, -1, 0)); + static_assert(cuda::bitfield_insert(T{0}, T{0}, 0, -1)); + static_assert(cuda::bitfield_insert(T{0}, T{0}, 0, 33)); + static_assert(cuda::bitfield_insert(T{0}, T{0}, 32, 0)); + static_assert(cuda::bitfield_insert(T{0}, T{0}, 20, 20)); + + static_assert(cuda::bitfield_extract(T{0}, -1, 0)); + static_assert(cuda::bitfield_extract(T{0}, 0, -1)); + static_assert(cuda::bitfield_extract(T{0}, 0, 33)); + static_assert(cuda::bitfield_extract(T{0}, 32, 0)); + static_assert(cuda::bitfield_extract(T{0}, 20, 20)); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/bit/bitfield.pass.cpp b/libcudacxx/test/libcudacxx/cuda/bit/bitfield.pass.cpp new file mode 100644 index 00000000000..5099b7b87d2 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/bit/bitfield.pass.cpp @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ constexpr bool test() +{ + using nl = cuda::std::numeric_limits; + constexpr T all_ones = static_cast(~T{0}); + unused(all_ones); + assert(cuda::bitfield_insert(T{0}, all_ones, 0, 1) == 1); + assert(cuda::bitfield_insert(T{0}, all_ones, 1, 1) == 0b10); + assert(cuda::bitfield_insert(T{0b10}, all_ones, 0, 1) == 0b11); + assert(cuda::bitfield_insert(all_ones, all_ones, 0, 1) == all_ones); + assert(cuda::bitfield_insert(all_ones, all_ones, 2, 1) == all_ones); + assert(cuda::bitfield_insert(all_ones, T{0b1000}, 1, 2) == (all_ones & static_cast(~T{0b110}))); + + assert(cuda::bitfield_insert(T{0}, all_ones, 0, 2) == 0b11); + assert(cuda::bitfield_insert(T{0}, all_ones, 3, 2) == 0b11000); + assert(cuda::bitfield_insert(T{0b10100000}, all_ones, 3, 2) == 0b10111000); + assert(cuda::bitfield_insert(T{0b10100000}, T{0b11}, 3, 2) == 0b10111000); + assert(cuda::bitfield_insert(T{0}, all_ones, nl::digits - 1, 1) == (T{1} << (nl::digits - 1u))); + assert(cuda::bitfield_insert(T{0b10100000}, all_ones, 0, nl::digits) == all_ones); + + assert(cuda::bitfield_extract(T{0}, 3, 4) == 0); + assert(cuda::bitfield_extract(T{0b1011}, 0, 1) == 1); + assert(cuda::bitfield_extract(T{0b1011}, 1, 1) == 1); + assert(cuda::bitfield_extract(T{0b1011}, 2, 2) == 0b10); + assert(cuda::bitfield_extract(all_ones, 0, 4) == 0b1111); + assert(cuda::bitfield_extract(all_ones, 2, 4) == 0b1111); + + assert(cuda::bitfield_extract(T{0b1010010}, 0, 2) == 0b10); + assert(cuda::bitfield_extract(T{0b10101100}, 3, 2) == 1); + assert(cuda::bitfield_extract(T{0b10100000}, 3, 3) == 0b100); + + assert(cuda::bitfield_extract(T{all_ones}, nl::digits - 1, 1) == 1); + assert(cuda::bitfield_extract(T{0b10100000}, 0, nl::digits) == T{0b10100000}); + return true; +} + +__host__ __device__ constexpr bool test() +{ + test(); + test(); + test(); + test(); + test(); + + test(); + test(); + test(); + test(); + test(); + test(); + test(); + +#if _CCCL_HAS_INT128() + test<__uint128_t>(); +#endif // _CCCL_HAS_INT128() + return true; +} + +int main(int, char**) +{ + assert(test()); + static_assert(test()); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/bit/bitmask.pass.cpp b/libcudacxx/test/libcudacxx/cuda/bit/bitmask.pass.cpp new file mode 100644 index 00000000000..5ec6960e09d --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/bit/bitmask.pass.cpp @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ constexpr bool test() +{ + using nl = cuda::std::numeric_limits; + constexpr T all_ones = static_cast(~T{0}); + unused(all_ones); + assert(cuda::bitmask(0, 1) == 1); + assert(cuda::bitmask(1, 1) == 0b10); + assert(cuda::bitmask(0, 2) == 0b11); + assert(cuda::bitmask(2, 2) == 0b1100); + + assert(cuda::bitmask(0, 2) == 0b11); + assert(cuda::bitmask(3, 2) == 0b11000); + assert(cuda::bitmask(nl::digits - 1, 1) == (T{1} << (nl::digits - 1u))); + assert(cuda::bitmask(0, nl::digits) == all_ones); + return true; +} + +__host__ __device__ constexpr bool test() +{ + test(); + test(); + test(); + test(); + test(); + + test(); + test(); + test(); + test(); + test(); + test(); + test(); + +#if _CCCL_HAS_INT128() + test<__uint128_t>(); +#endif // _CCCL_HAS_INT128() + return true; +} + +int main(int, char**) +{ + assert(test()); + static_assert(test()); + return 0; +}