https://github.com/gchatelet updated https://github.com/llvm/llvm-project/pull/77081
>From fb8dbd55aacb3a25678b8092a11dd4e562857344 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchate...@google.com>
Date: Fri, 5 Jan 2024 11:01:30 +0000
Subject: [PATCH 1/6] [libc] Fix buggy AVX2 `memcmp`

Fixes 77080.
---
 libc/src/string/memory_utils/op_x86.h | 33 ++++++++++++++++++++++-----
 libc/test/src/string/memcmp_test.cpp  |  7 ++++++
 2 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
index 1a20659c178cd1..23e6b897997e90 100644
--- a/libc/src/string/memory_utils/op_x86.h
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -129,7 +129,8 @@ LIBC_INLINE __m128i bytewise_reverse(__m128i value) {
                                       8, 9, 10, 11, 12, 13, 14, 15));
 }
 LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
-  return static_cast<uint16_t>(_mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
+  return static_cast<uint16_t>(
+      _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
 }
 template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
   const auto a = load<__m128i>(p1, offset);
@@ -181,11 +182,31 @@ LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
   return _mm256_max_epu8(a, b);
 }
 LIBC_INLINE __m256i bytewise_reverse(__m256i value) {
-  return _mm256_shuffle_epi8(value,
-                             _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,          //
-                                             8, 9, 10, 11, 12, 13, 14, 15,    //
-                                             16, 17, 18, 19, 20, 21, 22, 23,  //
-                                             24, 25, 26, 27, 28, 29, 30, 31));
+  const __m256i indices = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,          //
+                                          8, 9, 10, 11, 12, 13, 14, 15,    //
+                                          16, 17, 18, 19, 20, 21, 22, 23,  //
+                                          24, 25, 26, 27, 28, 29, 30, 31);
+#if defined(__AVX512VBMI__) && defined(__AVX512VL__)
+  // AVX512 allows full __m256i byte permutation.
+  // ymm = ymm[31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,
+  //           15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
+  return _mm256_permutexvar_epi8(value, indices);
+#else
+  // We can't byte-reverse __m256i in a single instruction with AVX2.
+  // '_mm256_shuffle_epi8' can only shuffle within each xmm lane
+  // leading to:
+  // ymm = ymm[15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+  //           31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
+  const __m256i tmp = _mm256_shuffle_epi8(value, indices);
+  // Then we shuffle accross lanes using 64 bit values.
+  // ymm = ymm[2,3,0,1]
+  // Leading to a fully reversed vector
+  // ymm = ymm[31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,
+  //           15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
+  // The immediate encodes the 64 bit word indices : 1, 0, 3, 2.
+  // Each index is encoded with 2 bits : 0b01'00'11'10.
+  return _mm256_permute4x64_epi64(tmp, 0b01'00'11'10);
+#endif
 }
 LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
   return _mm256_movemask_epi8(bytewise_reverse(_mm256_cmpeq_epi8(max, value)));
diff --git a/libc/test/src/string/memcmp_test.cpp b/libc/test/src/string/memcmp_test.cpp
index 03a0ac1c0ba655..a69257704a64a2 100644
--- a/libc/test/src/string/memcmp_test.cpp
+++ b/libc/test/src/string/memcmp_test.cpp
@@ -37,6 +37,13 @@ TEST(LlvmLibcMemcmpTest, LhsAfterRhsLexically) {
   EXPECT_GT(LIBC_NAMESPACE::memcmp(lhs, rhs, 2), 0);
 }
 
+TEST(LlvmLibcMemcmpTest, Issue77080) {
+  // https://github.com/llvm/llvm-project/issues/77080
+  constexpr char lhs[35] = "1.069cd68bbe76eb2143a3284d27ebe220";
+  constexpr char rhs[35] = "1.0500185b5d966a544e2d0fa40701b0f3";
+  EXPECT_GT(LIBC_NAMESPACE::memcmp(lhs, rhs, 34), 0);
+}
+
 // Adapt CheckMemcmp signature to memcmp.
 static inline int Adaptor(cpp::span<char> p1, cpp::span<char> p2, size_t size) {
   return LIBC_NAMESPACE::memcmp(p1.begin(), p2.begin(), size);
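A note for readers following the fix: the contract of `big_endian_cmp_mask` is that comparing the returned masks as plain integers must agree with the lexicographic order `memcmp` has to produce, which is why the equality bytes are reversed before `movemask` on little-endian x86. The AVX2 bug was that `_mm256_shuffle_epi8` only reversed within 16-byte lanes, so the mask bits came out in the wrong order. Below is a minimal scalar model of the 32-byte variant, written for this note (the `_ref` function and the test values are ours, not part of the patch):

#include <cassert>
#include <cstddef>
#include <cstdint>

// Scalar sketch of the 32-byte big_endian_cmp_mask: bit 31 of the result
// corresponds to the byte at offset 0, so an integer comparison of two
// masks orders them the way memcmp orders the underlying byte sequences.
static uint32_t big_endian_cmp_mask_ref(const uint8_t max[32],
                                        const uint8_t value[32]) {
  uint32_t mask = 0;
  for (size_t i = 0; i < 32; ++i)
    mask |= static_cast<uint32_t>(max[i] == value[i]) << (31 - i);
  return mask;
}

int main() {
  uint8_t a[32] = {}, b[32] = {};
  a[1] = 1; // 'a' and 'b' first differ at offset 1, with a[1] > b[1].
  uint8_t vmax[32];
  for (size_t i = 0; i < 32; ++i)
    vmax[i] = a[i] > b[i] ? a[i] : b[i]; // models bytewise_max
  // 'le' has a zero bit exactly where 'b' is below the max; the most
  // significant differing bit decides the comparison, as in memcmp.
  const uint32_t le = big_endian_cmp_mask_ref(vmax, b);
  const uint32_t ge = big_endian_cmp_mask_ref(vmax, a);
  assert(ge > le); // 'a' compares greater than 'b', so memcmp must be > 0.
  return 0;
}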
>From 04891668ef388ed354d9a18969ada20d54371ce6 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchate...@google.com>
Date: Mon, 8 Jan 2024 08:51:12 +0000
Subject: [PATCH 2/6] Make test clearer

---
 libc/test/src/string/memcmp_test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/test/src/string/memcmp_test.cpp b/libc/test/src/string/memcmp_test.cpp
index a69257704a64a2..ca7a5c7ce37023 100644
--- a/libc/test/src/string/memcmp_test.cpp
+++ b/libc/test/src/string/memcmp_test.cpp
@@ -41,7 +41,7 @@ TEST(LlvmLibcMemcmpTest, Issue77080) {
   // https://github.com/llvm/llvm-project/issues/77080
   constexpr char lhs[35] = "1.069cd68bbe76eb2143a3284d27ebe220";
   constexpr char rhs[35] = "1.0500185b5d966a544e2d0fa40701b0f3";
-  EXPECT_GT(LIBC_NAMESPACE::memcmp(lhs, rhs, 34), 0);
+  ASSERT_GE(LIBC_NAMESPACE::memcmp(lhs, rhs, 34), 1);
 }
 
 // Adapt CheckMemcmp signature to memcmp.

>From 0c76eb213447ad4b2eb77e040053867b8d9d5505 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchate...@google.com>
Date: Mon, 8 Jan 2024 08:51:18 +0000
Subject: [PATCH 3/6] Fix typo

---
 libc/src/string/memory_utils/op_x86.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
index 23e6b897997e90..7313f67fd647bf 100644
--- a/libc/src/string/memory_utils/op_x86.h
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -198,7 +198,7 @@ LIBC_INLINE __m256i bytewise_reverse(__m256i value) {
   // ymm = ymm[15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
   //           31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
   const __m256i tmp = _mm256_shuffle_epi8(value, indices);
-  // Then we shuffle accross lanes using 64 bit values.
+  // Then we shuffle across lanes using 64 bit values.
   // ymm = ymm[2,3,0,1]
   // Leading to a fully reversed vector
   // ymm = ymm[31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,

>From 2ca47472277a349f3e28834cc3918565fc00ccea Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchate...@google.com>
Date: Mon, 8 Jan 2024 16:05:45 +0000
Subject: [PATCH 4/6] Use nafi3000 suggestion and fix avx512 version as well.

---
 libc/src/string/memory_utils/op_x86.h | 102 +++++++++++++++++---------
 1 file changed, 66 insertions(+), 36 deletions(-)

diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
index 7313f67fd647bf..96988f8093b193 100644
--- a/libc/src/string/memory_utils/op_x86.h
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -181,36 +181,42 @@ template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
 LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
   return _mm256_max_epu8(a, b);
 }
-LIBC_INLINE __m256i bytewise_reverse(__m256i value) {
-  const __m256i indices = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,          //
-                                          8, 9, 10, 11, 12, 13, 14, 15,    //
-                                          16, 17, 18, 19, 20, 21, 22, 23,  //
-                                          24, 25, 26, 27, 28, 29, 30, 31);
+LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
+  // Bytewise comparison of 'max' and 'value'.
+  const __m256i little_endian_byte_mask = _mm256_cmpeq_epi8(max, value);
+  // Because x86 is little endian, bytes in the vector must be reversed before
+  // using movemask.
 #if defined(__AVX512VBMI__) && defined(__AVX512VL__)
-  // AVX512 allows full __m256i byte permutation.
-  // ymm = ymm[31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,
-  //           15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
-  return _mm256_permutexvar_epi8(value, indices);
+  // When AVX512BMI is available we can completely reverse the vector through
+  // VPERMB __m256i _mm256_permutexvar_epi8( __m256i idx, __m256i a);
+  const __m256i big_endian_byte_mask =
+      _mm256_permutexvar_epi8(_mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,          //
+                                              8, 9, 10, 11, 12, 13, 14, 15,    //
+                                              16, 17, 18, 19, 20, 21, 22, 23,  //
+                                              24, 25, 26, 27, 28, 29, 30, 31),
+                              little_endian_byte_mask);
+  // And turn the byte vector mask into an 'uint32_t' for direct scalar
+  // comparison.
+  return _mm256_movemask_epi8(big_endian_byte_mask);
 #else
-  // We can't byte-reverse __m256i in a single instruction with AVX2.
-  // '_mm256_shuffle_epi8' can only shuffle within each xmm lane
+  // We can't byte-reverse '__m256i' in a single instruction with AVX2.
+  // '_mm256_shuffle_epi8' can only shuffle within each 16-byte lane
   // leading to:
   // ymm = ymm[15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
   //           31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
-  const __m256i tmp = _mm256_shuffle_epi8(value, indices);
-  // Then we shuffle across lanes using 64 bit values.
-  // ymm = ymm[2,3,0,1]
-  // Leading to a fully reversed vector
-  // ymm = ymm[31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,
-  //           15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
-  // The immediate encodes the 64 bit word indices : 1, 0, 3, 2.
-  // Each index is encoded with 2 bits : 0b01'00'11'10.
-  return _mm256_permute4x64_epi64(tmp, 0b01'00'11'10);
+  // So we first shuffle each 16-byte lane leading to half-reversed vector mask.
+  const __m256i half_reversed = _mm256_shuffle_epi8(
+      little_endian_byte_mask, _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,          //
+                                               8, 9, 10, 11, 12, 13, 14, 15,    //
+                                               0, 1, 2, 3, 4, 5, 6, 7,          //
+                                               8, 9, 10, 11, 12, 13, 14, 15));
+  // Then we turn the vector into an uint32_t.
+  const uint32_t half_reversed_scalar = _mm256_movemask_epi8(half_reversed);
+  // And swap the lower and upper parts. This is optimized into a single `rorx`
+  // instruction.
+  return (half_reversed_scalar << 16) | (half_reversed_scalar >> 16);
 #endif
 }
-LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
-  return _mm256_movemask_epi8(bytewise_reverse(_mm256_cmpeq_epi8(max, value)));
-}
 template <>
 LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
   const auto a = load<__m256i>(p1, offset);
@@ -219,7 +225,7 @@ LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
   const auto le = big_endian_cmp_mask(vmax, b);
   const auto ge = big_endian_cmp_mask(vmax, a);
   static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint32_t>);
-  return cmp_uint32_t(ge, le);
+  return cmp_neq_uint64_t(ge, le);
 }
 #endif // __AVX2__
 
@@ -231,19 +237,43 @@ template <> struct cmp_is_expensive<__m512i> : cpp::true_type {};
 LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
   return _mm512_max_epu8(a, b);
 }
-LIBC_INLINE __m512i bytewise_reverse(__m512i value) {
-  return _mm512_shuffle_epi8(value,
-                             _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,          //
-                                             8, 9, 10, 11, 12, 13, 14, 15,    //
-                                             16, 17, 18, 19, 20, 21, 22, 23,  //
-                                             24, 25, 26, 27, 28, 29, 30, 31,  //
-                                             32, 33, 34, 35, 36, 37, 38, 39,  //
-                                             40, 41, 42, 43, 44, 45, 46, 47,  //
-                                             48, 49, 50, 51, 52, 53, 54, 55,  //
-                                             56, 57, 58, 59, 60, 61, 62, 63));
-}
 LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
-  return _mm512_cmpeq_epi8_mask(bytewise_reverse(max), bytewise_reverse(value));
+#if defined(__AVX512VBMI__)
+  // When AVX512BMI is available we can completely reverse the vector through
+  // VPERMB __m512i _mm512_permutexvar_epi8( __m512i idx, __m512i a);
+  const auto indices = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,          //
+                                       8, 9, 10, 11, 12, 13, 14, 15,    //
+                                       16, 17, 18, 19, 20, 21, 22, 23,  //
+                                       24, 25, 26, 27, 28, 29, 30, 31,  //
+                                       32, 33, 34, 35, 36, 37, 38, 39,  //
+                                       40, 41, 42, 43, 44, 45, 46, 47,  //
+                                       48, 49, 50, 51, 52, 53, 54, 55,  //
+                                       56, 57, 58, 59, 60, 61, 62, 63);
+  // Then we compute the mask for equal bytes.
+  return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max),    //
+                                _mm512_permutexvar_epi8(indices, value));
+#else
+  // We can't byte-reverse '__m512i' in a single instruction with __AVX512BW__.
+  // '_mm512_shuffle_epi8' can only shuffle within each 16-byte lane.
+  // zmm = | 16 bytes | 16 bytes | 16 bytes | 16 bytes |
+  // So we only reverse groups of 8 bytes, these groups are necessarily within a
+  // 16-byte lane.
+  const __m512i indices = _mm512_set_epi8(56, 57, 58, 59, 60, 61, 62, 63,  //
+                                          48, 49, 50, 51, 52, 53, 54, 55,  //
+                                          40, 41, 42, 43, 44, 45, 46, 47,  //
+                                          32, 33, 34, 35, 36, 37, 38, 39,  //
+                                          24, 25, 26, 27, 28, 29, 30, 31,  //
+                                          16, 17, 18, 19, 20, 21, 22, 23,  //
+                                          8, 9, 10, 11, 12, 13, 14, 15,    //
+                                          0, 1, 2, 3, 4, 5, 6, 7);
+  // Then we compute the mask for equal bytes. In this mask the bits of each
+  // byte are already reversed but the byte themselves should be reversed, this
+  // is done by using a bswap instruction.
+  return __builtin_bswap64(
+      _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(max, indices),    //
+                             _mm512_shuffle_epi8(value, indices)));
+
+#endif
 }
 template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
   const auto a = load<__m512i>(p1, offset);
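A scalar sanity check of the new AVX2 fallback may help here: once each 16-byte lane is reversed and movemask is taken, swapping the two 16-bit halves of the scalar is exactly a 32-bit rotate by 16, which is why a single rorx suffices. The following self-contained check of that identity is our own illustration, not part of the patch:

#include <cassert>
#include <cstddef>
#include <cstdint>

// Bit-level model of the AVX2 fallback (our sketch, not the library code).
// 'eq[i]' stands for the i-th byte of the _mm256_cmpeq_epi8 result.
static uint32_t full_reverse_mask(const bool eq[32]) {
  // Reference: bit 31 corresponds to byte 0 (big-endian bit order).
  uint32_t mask = 0;
  for (size_t i = 0; i < 32; ++i)
    mask |= static_cast<uint32_t>(eq[i]) << (31 - i);
  return mask;
}

static uint32_t half_reverse_then_rotate(const bool eq[32]) {
  // _mm256_shuffle_epi8 reverses bytes within each 16-byte lane only, so
  // movemask yields: bits 0..15 = bytes 15..0, bits 16..31 = bytes 31..16.
  uint32_t half = 0;
  for (size_t i = 0; i < 16; ++i)
    half |= static_cast<uint32_t>(eq[i]) << (15 - i);
  for (size_t i = 16; i < 32; ++i)
    half |= static_cast<uint32_t>(eq[i]) << (47 - i);
  // Swapping the 16-bit halves is a rotate by 16; compilers can lower this
  // pattern to a single rorx instruction.
  return (half << 16) | (half >> 16);
}

int main() {
  bool eq[32] = {};
  eq[0] = eq[17] = eq[31] = true; // arbitrary pattern
  assert(full_reverse_mask(eq) == half_reverse_then_rotate(eq));
  return 0;
}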
>From de6e7b662db682d8ea72a39205a639290180be55 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchate...@google.com>
Date: Tue, 9 Jan 2024 16:06:02 +0000
Subject: [PATCH 5/6] Disable AVX512BMI version for now

---
 libc/src/string/memory_utils/op_x86.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
index 96988f8093b193..6ad3fba0df032d 100644
--- a/libc/src/string/memory_utils/op_x86.h
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -238,7 +238,11 @@ LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
   return _mm512_max_epu8(a, b);
 }
 LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
-#if defined(__AVX512VBMI__)
+  // The AVX512BMI version is disabled due to bad codegen.
+  // https://github.com/llvm/llvm-project/issues/77459
+  // https://github.com/llvm/llvm-project/pull/77081
+  // TODO: Re-enable when clang version meets the fixed version.
+#if false && defined(__AVX512VBMI__)
   // When AVX512BMI is available we can completely reverse the vector through
   // VPERMB __m512i _mm512_permutexvar_epi8( __m512i idx, __m512i a);
   const auto indices = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,          //

>From a49f014e9030c1578c4143d48882d09c7c152da0 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchate...@google.com>
Date: Tue, 9 Jan 2024 16:30:28 +0000
Subject: [PATCH 6/6] Reformulate documentation

---
 libc/src/string/memory_utils/op_x86.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
index 6ad3fba0df032d..5ee07823c64974 100644
--- a/libc/src/string/memory_utils/op_x86.h
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -259,9 +259,10 @@ LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
 #else
   // We can't byte-reverse '__m512i' in a single instruction with __AVX512BW__.
   // '_mm512_shuffle_epi8' can only shuffle within each 16-byte lane.
-  // zmm = | 16 bytes | 16 bytes | 16 bytes | 16 bytes |
   // So we only reverse groups of 8 bytes, these groups are necessarily within a
   // 16-byte lane.
+  // zmm = | 16 bytes | 16 bytes | 16 bytes | 16 bytes |
+  // zmm = | <8> | <8> | <8> | <8> | <8> | <8> | <8> | <8> |
   const __m512i indices = _mm512_set_epi8(56, 57, 58, 59, 60, 61, 62, 63,  //
                                           48, 49, 50, 51, 52, 53, 54, 55,  //
                                           40, 41, 42, 43, 44, 45, 46, 47,  //
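To close the loop on the AVX512BW path that patches 4 and 6 document: reversing each aligned 8-byte group in the vector, taking the 64-bit equality mask, and then byte-swapping that mask is equivalent to reversing all 64 bytes. A scalar model of that identity, again our sketch rather than code from the patch:

#include <cassert>
#include <cstddef>
#include <cstdint>

// Reference: bit 63 of the mask corresponds to the byte at offset 0.
static uint64_t full_reverse_mask(const bool eq[64]) {
  uint64_t mask = 0;
  for (size_t i = 0; i < 64; ++i)
    mask |= static_cast<uint64_t>(eq[i]) << (63 - i);
  return mask;
}

// Model of the AVX512BW path: _mm512_shuffle_epi8 reverses each aligned
// 8-byte group, _mm512_cmpeq_epi8_mask yields one bit per byte, and
// __builtin_bswap64 reverses the eight 8-bit groups of that mask.
static uint64_t group_reverse_then_bswap(const bool eq[64]) {
  uint64_t mask = 0;
  for (size_t i = 0; i < 64; ++i) {
    // Reversing bytes within each 8-byte group moves eq[i] to bit
    // position 8*(i/8) + (7 - i%8) of the pre-bswap mask.
    const size_t bit = 8 * (i / 8) + (7 - i % 8);
    mask |= static_cast<uint64_t>(eq[i]) << bit;
  }
  return __builtin_bswap64(mask);
}

int main() {
  bool eq[64] = {};
  eq[0] = eq[9] = eq[35] = eq[63] = true; // arbitrary pattern
  assert(full_reverse_mask(eq) == group_reverse_then_bswap(eq));
  return 0;
}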