From 9a42220d78ee5ced4a57a704e58b6a911eaaa55d Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Mon, 24 Feb 2025 21:02:02 +0000 Subject: [PATCH 1/3] ARCH/X86: Use UCS function to count leading zeros --- src/ucs/arch/x86_64/cpu.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/ucs/arch/x86_64/cpu.c b/src/ucs/arch/x86_64/cpu.c index 3776bc7c924..001d8d46145 100644 --- a/src/ucs/arch/x86_64/cpu.c +++ b/src/ucs/arch/x86_64/cpu.c @@ -1063,10 +1063,9 @@ size_t ucs_x86_nt_src_buffer_transfer(void *dst, const void *src, size_t len) static UCS_F_ALWAYS_INLINE void ucs_x86_copy_bytes_le_128(void *dst, const void *src, size_t len) { -#if defined (__LZCNT__) __m256i y0, y1, y2, y3; /* Handle lengths that fall usually within eager short range */ - switch (_lzcnt_u32(len)) { + switch (ucs_count_leading_zero_bits(len)) { /* 0 */ case 32: break; @@ -1121,9 +1120,6 @@ void ucs_x86_copy_bytes_le_128(void *dst, const void *src, size_t len) _mm256_storeu_si256(UCS_PTR_BYTE_OFFSET(dst, len - 32), y3); break; } -#else - memcpy(dst, src, len); -#endif } /* This is an adaptation of the memcpy code from https://github.com/amd/aocl-libmem From 58368cbe44b443448c520b0ec97ccaee050a9ac8 Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Tue, 25 Feb 2025 14:24:01 +0000 Subject: [PATCH 2/3] ARCH/X86: Use UCS function to count leading zeros --- config/m4/compiler.m4 | 18 ++++++++++++++++++ src/ucs/arch/bitops.h | 15 +++++++++++++-- test/gtest/ucs/test_bitops.cc | 20 ++++++++++++++++++++ 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/config/m4/compiler.m4 b/config/m4/compiler.m4 index b87ae6084be..87fa2e48af9 100644 --- a/config/m4/compiler.m4 +++ b/config/m4/compiler.m4 @@ -599,6 +599,24 @@ if $CC --version 2>&1 | grep -q Intel; then [AC_LANG_SOURCE([[int main(int argc, char **argv){return 0;}]])]) fi +# +# Check actual lzcnt support (at least nvc 24.9 fails to link, even if it defines __LZCNT__) +# +SAVE_CFLAGS="$CFLAGS" +CFLAGS="$CFLAGS -mlzcnt" +AC_MSG_CHECKING([if lzcnt is supported]) +AC_LINK_IFELSE([AC_LANG_SOURCE([[ + #include + int main(void) { + return (int)_lzcnt_u32(1) | (int)_lzcnt_u64(2); + } + ]])], + [AC_MSG_RESULT([yes]) + BASE_CFLAGS="-mlzcnt $BASE_CFLAGS" + AC_DEFINE([HAVE_LZCNT], 1, [LZCNT Intrinsic support])], + [AC_MSG_RESULT([no])]) +CFLAGS="$SAVE_CFLAGS" + # # Set C++ optimization/debug flags to be the same as for C diff --git a/src/ucs/arch/bitops.h b/src/ucs/arch/bitops.h index 86799a11182..dd336264b0b 100644 --- a/src/ucs/arch/bitops.h +++ b/src/ucs/arch/bitops.h @@ -28,6 +28,11 @@ BEGIN_C_DECLS #endif +#if defined(HAVE_LZCNT) +# include +#endif + + #define ucs_ilog2(_n) \ ( \ __builtin_constant_p(_n) ? ( \ @@ -121,10 +126,16 @@ BEGIN_C_DECLS ((sizeof(_n) <= 4) ? __builtin_ctz((uint32_t)(_n)) : __builtin_ctzl(_n)) /* Returns the number of leading 0-bits in _n. - * If _n is 0, the result is undefined */ +#if defined(HAVE_LZCNT) #define ucs_count_leading_zero_bits(_n) \ - ((sizeof(_n) <= 4) ? __builtin_clz((uint32_t)(_n)) : __builtin_clzl(_n)) + ((sizeof(_n) <= 4) ? _lzcnt_u32((uint32_t)(_n)) : _lzcnt_u64(_n)) +#else +#define ucs_count_leading_zero_bits(_n) \ + ((_n) ? ((sizeof(_n) <= 4) ? __builtin_clz((uint32_t)(_n)) : \ + __builtin_clzl(_n)) : \ + (sizeof(_n) * 8)) +#endif /* Returns the number of bits lower than 'bit_index' that are set in 'mask' * For example: ucs_bitmap2idx(mask=0xF0, idx=6) returns 2 diff --git a/test/gtest/ucs/test_bitops.cc b/test/gtest/ucs/test_bitops.cc index 7b019589e7a..c1de7f8ecba 100644 --- a/test/gtest/ucs/test_bitops.cc +++ b/test/gtest/ucs/test_bitops.cc @@ -138,6 +138,26 @@ UCS_TEST_F(test_bitops, is_equal) { test_bitops::check_bitwise_equality(buffer1, buffer2, indices, 0); } +template void test_clz() +{ + constexpr int bits = sizeof(T) * 8; + T v = 1; + + for (int i = bits - 1; v != 0; v <<= 1, --i) { + ASSERT_EQ(i, ucs_count_leading_zero_bits(v)); + } + + ASSERT_EQ(bits, ucs_count_leading_zero_bits(v)); +} + +UCS_TEST_F(test_bitops, clz) +{ + test_clz(); + test_clz(); + test_clz(); + test_clz(); +} + template void test_mask() { Type expected = 0; From 1b01dddde2c3b4c007f2b10d95de2afcc28b51f3 Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Wed, 26 Feb 2025 10:30:24 +0000 Subject: [PATCH 3/3] ARCH/X86: Use UCS function to count leading zeros --- contrib/test_jenkins.sh | 9 +++------ src/ucs/arch/x86_64/cpu.c | 2 +- test/gtest/ucs/arch/test_x86_64.cc | 17 +++++++++++++++-- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/contrib/test_jenkins.sh b/contrib/test_jenkins.sh index bcf4d473cd5..3769244d57e 100755 --- a/contrib/test_jenkins.sh +++ b/contrib/test_jenkins.sh @@ -1128,12 +1128,9 @@ run_release_mode_tests() { # Run nt_buffer_transfer tests # run_nt_buffer_transfer_tests() { - if lscpu | grep -q 'AuthenticAMD' - then - build release --enable-gtest --enable-optimizations - echo "==== Running nt_buffer_transfer tests ====" - ./test/gtest/gtest --gtest_filter="test_arch.nt_buffer_transfer_*" - fi + build release --enable-gtest --enable-optimizations + echo "==== Running test_arch tests with optimizations ====" + ./test/gtest/gtest --gtest_filter="test_arch.*" } set_ucx_common_test_env() { diff --git a/src/ucs/arch/x86_64/cpu.c b/src/ucs/arch/x86_64/cpu.c index 001d8d46145..0fb5e2a3862 100644 --- a/src/ucs/arch/x86_64/cpu.c +++ b/src/ucs/arch/x86_64/cpu.c @@ -1061,7 +1061,7 @@ size_t ucs_x86_nt_src_buffer_transfer(void *dst, const void *src, size_t len) } static UCS_F_ALWAYS_INLINE -void ucs_x86_copy_bytes_le_128(void *dst, const void *src, size_t len) +void ucs_x86_copy_bytes_le_128(void *dst, const void *src, uint32_t len) { __m256i y0, y1, y2, y3; /* Handle lengths that fall usually within eager short range */ diff --git a/test/gtest/ucs/arch/test_x86_64.cc b/test/gtest/ucs/arch/test_x86_64.cc index 1cbab6c40d2..0d3c57b5d9e 100644 --- a/test/gtest/ucs/arch/test_x86_64.cc +++ b/test/gtest/ucs/arch/test_x86_64.cc @@ -76,6 +76,12 @@ class test_arch : public ucs::test { test_window_size = 8 * 1024; hole_size = 2 * align; + auto msg = [&]() { + std::stringstream ss; + ss << "using length=" << len << " src_align=" << i << " dst_align=" << j; + return ss.str(); + }; + /* * Allocate a hole above and below the test_window_size * to check for writes beyond the designated area. @@ -113,14 +119,20 @@ class test_arch : public ucs::test { /* Perform the transfer */ ucs_x86_nt_buffer_transfer(dst + i, src + j, len, hint, len); result = memcmp(src + j, dst + i, len); - EXPECT_EQ(0, result); + EXPECT_EQ(0, result) << msg(); + if (result) { + goto terminate; + } /* reset the copied region back to zero */ memset(dst + i, 0x0, len); /* check for any modifications in the holes */ result = memcmp(test_window_dst, dup, total_size); - EXPECT_EQ(0, result); + EXPECT_EQ(0, result) << msg(); + if (result) { + goto terminate; + } } } /* Check for each len for less than 1k sizes @@ -133,6 +145,7 @@ class test_arch : public ucs::test { } } + terminate: free(dup); dup_fail: