Issue |
128424
|
Summary |
[clang] Vectorizer fails to vectorize average round to single average round instruction
|
Labels |
clang
|
Assignees |
|
Reporter |
johnplatts
|
Here is a snippet of C++ code that Clang vectorizes to suboptimal code:
```
#include <stddef.h>
#include <stdint.h>
#include <array>
// Select the target's SIMD intrinsics header (if any) and define
// HAVE_SIMD_AVERAGE_ROUND as 1 when one was included, 0 otherwise.
#if defined(__GNUC__) || defined(__clang__)
// Save the current `vector`/`pixel`/`bool` macro definitions and undefine them
// before the includes; the pop_macro lines below restore the saved state.
#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")
#undef vector
#undef pixel
#undef bool
#if (defined(__i386__) || defined(__x86_64__)) && defined(__SSE2__)
#include <emmintrin.h>
#define HAVE_SIMD_AVERAGE_ROUND 1
#elif (defined(__powerpc__) || defined(__powerpc64__)) && defined(__ALTIVEC__)
#include <altivec.h>
// The AltiVec branch must set the flag too; otherwise the macro is undefined,
// `#if HAVE_SIMD_AVERAGE_ROUND` evaluates to 0, and the vec_avg variant is
// silently left out on PowerPC.
#define HAVE_SIMD_AVERAGE_ROUND 1
#elif defined(__s390x__) && defined(__VEC__)
#include <vecintrin.h>
#define HAVE_SIMD_AVERAGE_ROUND 1
#elif (defined(__aarch64__) || defined(__ARM_NEON))
#include <arm_neon.h>
#define HAVE_SIMD_AVERAGE_ROUND 1
#else
#define HAVE_SIMD_AVERAGE_ROUND 0
#endif
#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")
#else
// Compilers other than GCC/Clang: no intrinsic variant is provided.
#define HAVE_SIMD_AVERAGE_ROUND 0
#endif
// Scalar reference form of the per-element rounded average of two 16-byte
// arrays: result[i] = (a[i] + b[i] + 1) >> 1, narrowed back to uint8_t.
// This is the function the report compares between GCC and Clang, so the
// expression is kept exactly as written.
std::array<uint8_t, 16> AverageRoundU8_1(std::array<uint8_t, 16> a, std::array<uint8_t, 16> b) {
std::array<uint8_t, 16> result;
for (size_t i = 0; i < 16; i++) {
// Operands are cast to uint16_t before the add; the sum is at most
// 255 + 255 + 1 = 511, and after the shift it fits in uint8_t again.
result[i] = static_cast<uint8_t>((static_cast<uint16_t>(a[i]) +
static_cast<uint16_t>(b[i]) + uint16_t{1}) >> 1);
}
return result;
}
// Alternative scalar form of the per-element rounded average:
// result[i] = (a[i] >> 1) + (b[i] >> 1) + ((a[i] | b[i]) & 1).
// Each operand is halved first, and the last term adds 1 when either low bit
// is set, which matches (a[i] + b[i] + 1) >> 1 for all 8-bit inputs.
std::array<uint8_t, 16> AverageRoundU8_2(std::array<uint8_t, 16> a, std::array<uint8_t, 16> b) {
std::array<uint8_t, 16> result;
for (size_t i = 0; i < 16; i++) {
const uint8_t a_i = a[i];
const uint8_t b_i = b[i];
// The three terms sum to at most 127 + 127 + 1 = 255, so the implicit
// conversion back to uint8_t on assignment loses nothing.
result[i] =
static_cast<uint8_t>(static_cast<unsigned>(a_i) >> 1) +
static_cast<uint8_t>(static_cast<unsigned>(b_i) >> 1) +
static_cast<uint8_t>((a_i | b_i) & 1u);
}
return result;
}
#if HAVE_SIMD_AVERAGE_ROUND
// Intrinsic baseline: loads both arrays as 16-byte vectors, applies the
// target's averaging intrinsic (_mm_avg_epu8, vec_avg, or vrhaddq_u8), and
// stores the vector result back into a std::array.
std::array<uint8_t, 16> AverageRoundU8_3(std::array<uint8_t, 16> a, std::array<uint8_t, 16> b) {
// 16 x uint8_t GCC-style vector type; declared with alignment 1 and may_alias
// so it can be read from / written to the arrays' storage through a cast.
typedef uint8_t GccU8x16Vec __attribute__((__vector_size__(16), __aligned__(1), __may_alias__));
GccU8x16Vec vec_a = *reinterpret_cast<const GccU8x16Vec*>(a.data());
GccU8x16Vec vec_b = *reinterpret_cast<const GccU8x16Vec*>(b.data());
#if (defined(__i386__) || defined(__x86_64__)) && defined(__SSE2__)
// x86 with SSE2: operands are reinterpreted as __m128i for _mm_avg_epu8.
GccU8x16Vec result_vec =
reinterpret_cast<GccU8x16Vec>(
_mm_avg_epu8(reinterpret_cast<__m128i>(vec_a), reinterpret_cast<__m128i>(vec_b)));
#elif ((defined(__powerpc__) || defined(__powerpc64__)) && defined(__ALTIVEC__)) || \
(defined(__s390x__) && defined(__VEC__))
// PowerPC AltiVec and s390x vector targets share the vec_avg spelling.
GccU8x16Vec result_vec =
reinterpret_cast<GccU8x16Vec>(
vec_avg(reinterpret_cast<__vector unsigned char>(vec_a),
reinterpret_cast<__vector unsigned char>(vec_b)));
#elif (defined(__aarch64__) || defined(__ARM_NEON))
// Arm NEON: operands are reinterpreted as uint8x16_t for vrhaddq_u8.
GccU8x16Vec result_vec =
reinterpret_cast<GccU8x16Vec>(
vrhaddq_u8(reinterpret_cast<uint8x16_t>(vec_a),
reinterpret_cast<uint8x16_t>(vec_b)));
#endif
std::array<uint8_t, 16> result;
// Store all 16 lanes into the array's storage in one vector write.
*reinterpret_cast<GccU8x16Vec*>(result.data()) = result_vec;
return result;
}
#endif
```
GCC 14 does a much better job of vectorizing AverageRoundU8_1 than Clang 19 (or Clang trunk) does. The output of compiling the above snippet with various GCC and Clang releases can be found on Compiler Explorer at https://godbolt.org/z/hanoerdPP.
A related issue regarding suboptimal LLVM codegen has been reported over at https://github.com/llvm/llvm-project/issues/128377.
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs