Issue 128424
Summary [clang] Vectorizer fails to vectorize a rounding average into a single average-round instruction
Labels clang
Assignees
Reporter johnplatts
    Here is a snippet of C++ code that Clang vectorizes to suboptimal code:
```
#include <stddef.h>
#include <stdint.h>
#include <array>

// Select the target's SIMD intrinsics header and define
// HAVE_SIMD_AVERAGE_ROUND to 1 when a byte-wise average intrinsic is
// available for AverageRoundU8_3, or to 0 otherwise. The macro is defined on
// every path so that `#if HAVE_SIMD_AVERAGE_ROUND` never tests an undefined
// name.
#if defined(__GNUC__) || defined(__clang__)

// Save and clear any `vector`, `pixel` and `bool` macros around the intrinsic
// header includes; the saved definitions are restored by the pop_macro
// pragmas below, so anything the headers define under these names does not
// leak into the rest of the file.
#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")

#undef vector
#undef pixel
#undef bool

#if (defined(__i386__) || defined(__x86_64__)) && defined(__SSE2__)
#include <emmintrin.h>
#define HAVE_SIMD_AVERAGE_ROUND 1
#elif (defined(__powerpc__) || defined(__powerpc64__)) && defined(__ALTIVEC__)
#include <altivec.h>
// AverageRoundU8_3 has a vec_avg path for this target, so it must be enabled
// here just like on the other SIMD targets.
#define HAVE_SIMD_AVERAGE_ROUND 1
#elif defined(__s390x__) && defined(__VEC__)
#include <vecintrin.h>
#define HAVE_SIMD_AVERAGE_ROUND 1
#elif (defined(__aarch64__) || defined(__ARM_NEON))
#include <arm_neon.h>
#define HAVE_SIMD_AVERAGE_ROUND 1
#else
#define HAVE_SIMD_AVERAGE_ROUND 0
#endif

#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")

#else
#define HAVE_SIMD_AVERAGE_ROUND 0
#endif


// Lane-wise rounding average of two 16-byte arrays: each output lane is
// (a[i] + b[i] + 1) >> 1. The operands are widened to 16 bits before the
// addition so the biased sum (at most 511) is never truncated to 8 bits.
std::array<uint8_t, 16> AverageRoundU8_1(std::array<uint8_t, 16> a, std::array<uint8_t, 16> b) {
  std::array<uint8_t, 16> averaged;
  for (size_t lane = 0; lane != averaged.size(); ++lane) {
    const uint16_t lhs = a[lane];
    const uint16_t rhs = b[lane];
    const uint16_t biased_sum = static_cast<uint16_t>(lhs + rhs + uint16_t{1});
    averaged[lane] = static_cast<uint8_t>(biased_sum >> 1);
  }
  return averaged;
}


// Lane-wise rounding average of two 16-byte arrays, written without a widened
// sum: each input is halved first, and one is added back whenever either
// input has its low bit set, i.e. (a >> 1) + (b >> 1) + ((a | b) & 1).
std::array<uint8_t, 16> AverageRoundU8_2(std::array<uint8_t, 16> a, std::array<uint8_t, 16> b) {
  std::array<uint8_t, 16> averaged;
  for (size_t lane = 0; lane != averaged.size(); ++lane) {
    const uint8_t lhs = a[lane];
    const uint8_t rhs = b[lane];
    const uint8_t half_lhs = static_cast<uint8_t>(static_cast<unsigned>(lhs) >> 1);
    const uint8_t half_rhs = static_cast<uint8_t>(static_cast<unsigned>(rhs) >> 1);
    // Rounding term: 1 if at least one of the discarded low bits was set.
    const uint8_t round_bit = static_cast<uint8_t>((lhs | rhs) & 1u);
    averaged[lane] = static_cast<uint8_t>(half_lhs + half_rhs + round_bit);
  }
  return averaged;
}

#if HAVE_SIMD_AVERAGE_ROUND
// Averages two 16-byte arrays by calling the target's byte-average intrinsic
// directly (_mm_avg_epu8, vec_avg or vrhaddq_u8, depending on the target).
// Only compiled when HAVE_SIMD_AVERAGE_ROUND is non-zero.
std::array<uint8_t, 16> AverageRoundU8_3(std::array<uint8_t, 16> a, std::array<uint8_t, 16> b) {
  // 16-byte GNU vector of uint8_t. Declared with alignment 1 and may_alias so
  // it can be read from and written to std::array storage through a cast
  // pointer.
  typedef uint8_t GccU8x16Vec __attribute__((__vector_size__(16), __aligned__(1), __may_alias__));

  std::array<uint8_t, 16> averaged;

  // Load both inputs as whole vectors.
  GccU8x16Vec lhs = *reinterpret_cast<const GccU8x16Vec*>(a.data());
  GccU8x16Vec rhs = *reinterpret_cast<const GccU8x16Vec*>(b.data());

  // Each branch casts to the intrinsic's native vector type, invokes the
  // average intrinsic, and casts the result back to the generic vector type.
#if (defined(__i386__) || defined(__x86_64__)) && defined(__SSE2__)
  GccU8x16Vec avg =
      reinterpret_cast<GccU8x16Vec>(
          _mm_avg_epu8(reinterpret_cast<__m128i>(lhs), reinterpret_cast<__m128i>(rhs)));
#elif ((defined(__powerpc__) || defined(__powerpc64__)) && defined(__ALTIVEC__)) || \
 (defined(__s390x__) && defined(__VEC__))
  GccU8x16Vec avg =
      reinterpret_cast<GccU8x16Vec>(
          vec_avg(reinterpret_cast<__vector unsigned char>(lhs),
                  reinterpret_cast<__vector unsigned char>(rhs)));
#elif (defined(__aarch64__) || defined(__ARM_NEON))
  GccU8x16Vec avg =
      reinterpret_cast<GccU8x16Vec>(
          vrhaddq_u8(reinterpret_cast<uint8x16_t>(lhs),
                     reinterpret_cast<uint8x16_t>(rhs)));
#endif

  // Store the vector result into the returned array.
  *reinterpret_cast<GccU8x16Vec*>(averaged.data()) = avg;
  return averaged;
}
#endif
```

GCC 14 does a much better job of vectorizing AverageRoundU8_1 than Clang 19 (or Clang trunk) does. The output of compiling the above snippet with various GCC and Clang releases can be viewed on Compiler Explorer at https://godbolt.org/z/hanoerdPP.

A related issue regarding suboptimal LLVM codegen has been reported over at https://github.com/llvm/llvm-project/issues/128377.
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to