Issue |
127134
|
Summary |
[AArch64] auto-vectorizer regression
|
Labels |
new issue
|
Assignees |
|
Reporter |
appujee
|
Baseline: c4c5e79dd4b4c78eee7cffd9b0d7394b5bedcf12
Regressing version: 3c92011b600bdf70424e2547594dd461fe411a41 or more recently f142f8afe21bceb00fb495468aa0b5043e98c419
Command to repro:
```
$CC -c -nostdlibinc -D__BIONIC_NO_PAGE_SIZE_MACRO -O2 -DANDROID -DNDEBUG -UDEBUG -D__compiler_offsetof=__builtin_offsetof -D__ANDROID_UNAVAILABLE_SYMBOLS_ARE_WEAK__ -faddrsig -fcolor-diagnostics -ffp-contract=off -fno-exceptions -fno-strict-aliasing -fmessage-length=0 -gsimple-template-names -gz=zstd -no-canonical-prefixes -ftrivial-auto-var-init=zero -ffunction-sections -fdata-sections -fno-short-enums -funwind-tables -fstack-protector-strong -Wa,--noexecstack -D_FORTIFY_SOURCE=2 -Wstrict-aliasing=2 -march=armv8.2-a -mcpu=cortex-a55 -target aarch64-linux-android10000 -DANDROID_STRICT -fPIE -Wimplicit-fallthrough -D_LIBCPP_ENABLE_THREAD_SAFETY_ANNOTATIONS -Wno-gnu-include-next -fvisibility-inlines-hidden -Isystem/media/audio_utils/benchmarks -D__LIBC_API__=10000 -D__LIBM_API__=10000 -D__LIBDL_API__=10000 -Iexternal/google-benchmark/include -Iprebuilts/clang/host/linux-x86/clang-${cver}/android_libc++/platform/aarch64/include/c++/v1 -Iprebuilts/clang/host/linux-x86/clang-${cver}/include/c++/v1 -Ibionic/libc/async_safe/include -Isystem/logging/liblog/include -Ibionic/libc/system_properties/include -Isystem/core/property_service/libpropertyinfoparser/include -Ibionic/libdl/include_private -isystem bionic/libc/include -isystem bionic/libc/kernel/uapi/asm-arm64 -isystem bionic/libc/kernel/uapi -isystem bionic/libc/kernel/android/scsi -isystem bionic/libc/kernel/android/uapi -fsanitize=memtag-heap -fsanitize-trap=all -std=gnu++20 -fno-rtti -Isystem/core/include -Isystem/logging/liblog/include -Isystem/media/audio/include -Ihardware/libhardware/include -Ihardware/libhardware_legacy/include -Ihardware/ril/include -Iframeworks/native/include -Iframeworks/native/opengl/include -Iframeworks/av/include -o audio_vectorization_benchmark.o system/media/audio_utils/benchmarks/audio_vectorization_benchmark.cpp --save-temps
```
$ grep -c '[u|f]cvt' audio_vectorization_benchmark_new.s
24
$ grep -c '[u|f]cvt' audio_vectorization_benchmark_baseline.s
42
I haven't reduced it yet, but here is the full source code:
```cpp
#include <functional>
#include <random>
#include <vector>
#include <benchmark/benchmark.h>
// A small subset of code from audio_utils/intrinsic_utils.h
// We conditionally include neon optimizations for ARM devices
#pragma push_macro("USE_NEON")
#undef USE_NEON
#if defined(__ARM_NEON__) || defined(__aarch64__)
#include <arm_neon.h>
#define USE_NEON
#endif
// Variable template that evaluates to false for every T.
// It is not referenced anywhere else in this excerpt; presumably the full header
// uses it for static_assert fallbacks in templates (TODO confirm).
template <typename T>
inline constexpr bool dependent_false_v = false;
// Aggregate wrapping a fixed-size C array so that it can be used by the Neon
// template functions below. The array is the struct's only data member, which
// lets a one-name structured binding reach it; it is that member `v` (not the
// struct itself) that satisfies std::is_array_v<>.
template <typename T, size_t N>
struct internal_array_t {
    // Number of elements stored in v.
    static constexpr size_t size() { return N; }

    // The wrapped elements.
    T v[N];
};
#ifdef USE_NEON
// Maps a lane count N to a hardware vector type through a nested member type `t`.
// The primary template is empty and no specializations appear in this excerpt, so
// vfloat_t<N> is only usable where a specialization is supplied elsewhere.
template<int N>
struct vfloat_struct {};
template<int N>
using vfloat_t = typename vfloat_struct<N>::t; // typename required for Android 14 and earlier.
// Selects vfloat_t<N> when F is float and internal_array_t<F, N> otherwise.
// NOTE(review): both alternatives are spelled out as std::conditional_t arguments,
// so vfloat_t<N> appears to need to be valid even when F is not float - confirm.
template<typename F, int N>
using vector_hw_t = std::conditional_t<
std::is_same_v<F, float>, vfloat_t<N>, internal_array_t<F, N>>;
#else
// use loop vectorization if no HW type exists.
template<typename F, int N>
using vector_hw_t = internal_array_t<F, N>;
#endif
// Returns the product of a and b.
//  - float / double: ordinary scalar multiplication.
//  - float32x2_t / float32x4_t / float64x2_t (only when USE_NEON is defined, the
//    last one only on aarch64): the matching NEON multiply intrinsic.
//  - any other T: T must be a struct with exactly one data member, which is
//    unpacked with structured bindings. If that member is an array, the elements
//    are multiplied pairwise by recursive calls to vmul. Otherwise the member must
//    itself decompose into two parts, and each part is multiplied recursively.
template <typename T>
static inline T vmul(T a, T b) {
    if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
        return a * b;
#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vmul_f32(a, b);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vmulq_f32(a, b);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vmulq_f64(a, b);
#endif
#endif  // USE_NEON
    } else /* constexpr */ {
        T product;
        auto& [dst] = product;  // single-member struct
        const auto& [lhs] = a;
        const auto& [rhs] = b;
        if constexpr (!std::is_array_v<decltype(dst)>) {
            // Two-part member: multiply each half separately.
            auto& [dstFirst, dstSecond] = dst;
            const auto& [lhsFirst, lhsSecond] = lhs;
            const auto& [rhsFirst, rhsSecond] = rhs;
            dstFirst = vmul(lhsFirst, rhsFirst);
            dstSecond = vmul(lhsSecond, rhsSecond);
        } else /* constexpr */ {
            // Array member: multiply element by element.
            constexpr size_t kElements = std::extent_v<decltype(dst)>;
#pragma unroll
            for (size_t idx = 0; idx < kElements; ++idx) {
                dst[idx] = vmul(lhs[idx], rhs[idx]);
            }
        }
        return product;
    }
}
#pragma pop_macro("USE_NEON")
// end intrinsics subset
// Frame count passed to Processor::process() on every benchmark iteration; each
// buffer allocated in BM_VectorTest holds kDataSize * channelCount elements.
static constexpr size_t kDataSize = 2048;
// Adds one argument set per channel count to benchmark b, covering every
// channel count from kChannelCountMin through kChannelCountMax inclusive.
static void TestArgs(benchmark::internal::Benchmark* b) {
    constexpr int kChannelCountMin = 1;
    constexpr int kChannelCountMax = 32;
    int channels = kChannelCountMin;
    while (channels <= kChannelCountMax) {
        b->Args({channels});
        ++channels;
    }
}
// Macro test operator.
// Expands to statements that process one group of N elements: the data at in1 and
// in2 is viewed as V<F, N>, the two values are passed to vmul, and the result is
// stored through out. All three pointers are then advanced by N elements.
// Expects F, V, out, in1 and in2 to be in scope where the macro is used.
// NOTE(review): the expansion is several statements rather than one, so it should
// only be used inside a braced block (as both uses in this file are).
#define OPERATOR(N) \
*reinterpret_cast<V<F, N>*>(out) = vmul( \
*reinterpret_cast<const V<F, N>*>(in1), \
*reinterpret_cast<const V<F, N>*>(in2)); \
out += N; \
in1 += N; \
in2 += N;
// Macro to instantiate switch case statements.
// Expands to `case N:` which assigns mFunc a captureless lambda that applies
// OPERATOR(N) `count` times and then breaks out of the switch.
// The static_assert checks that V<F, N> is exactly N * sizeof(F) bytes, i.e. the
// container adds nothing beyond the N elements (presumably what the pointer casts
// in OPERATOR rely on).
#define INSTANTIATE(N) \
case N: \
mFunc = [](F* out, const F* in1, const F* in2, size_t count) { \
static_assert(sizeof(V<F, N>) == N * sizeof(F)); \
for (size_t i = 0; i < count; ++i) { \
OPERATOR(N); \
} \
}; \
break;
// Holds the multiply kernel that the benchmark times. The kernel is chosen once in
// the constructor and stored in mFunc; process() forwards to it.
//
// Traits must provide (as used below):
//   data_t             - element type (aliased as F)
//   container_t<T, N>  - type through which OPERATOR views N elements (aliased as V)
//   loop_              - constant usable in `if constexpr`; when true the generic
//                        two-loop kernel is used instead of a per-channel-count one
template <typename Traits>
class Processor {
public:
// shorthand aliases
using F = typename Traits::data_t;
template <typename T, int N>
using V = typename Traits::template container_t<T, N>;
// Selects the kernel for channelCount and stores it in mFunc.
// NOTE(review): the switch below has no default case, so when loop_ is false and
// channelCount is outside 1..32, mFunc is left empty.
Processor(int channelCount)
: mChannelCount(channelCount) {
if constexpr (Traits::loop_) {
// Generic kernel: the channel count is a run-time value captured by the lambda,
// and each inner iteration handles a single element via OPERATOR(1).
mFunc = [channelCount](F* out, const F* in1, const F* in2, size_t count) {
for (size_t i = 0; i < count; ++i) {
// NOTE(review): j is size_t while channelCount is int, so this comparison
// mixes unsigned and signed operands.
for (size_t j = 0; j < channelCount; ++j) {
OPERATOR(1);
}
}
};
return;
}
// This switch is not inside an `else`, so it is still instantiated when loop_ is
// true, even though the return above makes it unreachable in that case.
// Each case installs a kernel with the channel count fixed at compile time.
switch (channelCount) {
INSTANTIATE(1);
INSTANTIATE(2);
INSTANTIATE(3);
INSTANTIATE(4);
INSTANTIATE(5);
INSTANTIATE(6);
INSTANTIATE(7);
INSTANTIATE(8);
INSTANTIATE(9);
INSTANTIATE(10);
INSTANTIATE(11);
INSTANTIATE(12);
INSTANTIATE(13);
INSTANTIATE(14);
INSTANTIATE(15);
INSTANTIATE(16);
INSTANTIATE(17);
INSTANTIATE(18);
INSTANTIATE(19);
INSTANTIATE(20);
INSTANTIATE(21);
INSTANTIATE(22);
INSTANTIATE(23);
INSTANTIATE(24);
INSTANTIATE(25);
INSTANTIATE(26);
INSTANTIATE(27);
INSTANTIATE(28);
INSTANTIATE(29);
INSTANTIATE(30);
INSTANTIATE(31);
INSTANTIATE(32);
}
}
// Runs the selected kernel; `frames` is passed as the kernel's `count` argument.
void process(F* out, const F* in1, const F* in2, size_t frames) {
mFunc(out, in1, in2, frames);
}
// Channel count given to the constructor; not read anywhere else in this class.
const size_t mChannelCount;
// Kernel selected by the constructor.
/* const */ std::function<void(F*, const F*, const F*, size_t)> mFunc;
};
// Benchmark body shared by every Traits configuration. Takes the channel count
// from state.range(0), fills two input buffers with pseudo-random values, and
// times Processor<Traits>::process() over kDataSize frames per iteration.
template <typename Traits>
static void BM_VectorTest(benchmark::State& state) {
    using F = typename Traits::data_t;

    const size_t channelCount = state.range(0);
    const size_t totalSamples = kDataSize * channelCount;
    std::vector<F> input1(totalSamples);
    std::vector<F> input2(totalSamples);
    std::vector<F> output(totalSamples);

    // Fixed seed so that every run sees the same input data. input1 is filled
    // before input2, both drawing from the same generator.
    std::minstd_rand gen(42);
    const F amplitude = 1.;
    std::uniform_real_distribution<> dis(-amplitude, amplitude);
    const auto randomize = [&gen, &dis](std::vector<F>& buffer) {
        for (F& sample : buffer) {
            sample = dis(gen);
        }
    };
    randomize(input1);
    randomize(input2);

    Processor<Traits> processor(channelCount);

    // Timed loop.
    while (state.KeepRunning()) {
        benchmark::DoNotOptimize(input1.data());
        benchmark::DoNotOptimize(input2.data());
        benchmark::DoNotOptimize(output.data());
        processor.process(output.data(), input1.data(), input2.data(), kDataSize);
        benchmark::ClobberMemory();
    }

    state.SetComplexityN(channelCount);
}
// Parameters are bundled in a traits struct instead of being passed as
// template-template arguments, because Clang has an issue with
// -frelaxed-template-template-args where it may not follow the C++17 guidelines.
//
// Configuration for the test that uses two loops, operating on float data.
struct LoopFloatTraits {
    // Selects the two-loop kernel in Processor.
    static constexpr bool loop_ = true;

    // Element type of the buffers.
    using data_t = float;

    // Container through which N elements of type F are viewed.
    template <typename F, int N>
    using container_t = internal_array_t<F, N>;
};
// Runs BM_VectorTest with the LoopFloatTraits configuration; this is the function
// registered with the benchmark library below.
static void BM_VectorTestLoopFloat(benchmark::State& state) {
BM_VectorTest<LoopFloatTraits>(state);
}
// Register the benchmark; TestArgs supplies one argument set per channel count (1..32).
BENCHMARK(BM_VectorTestLoopFloat)->Apply(TestArgs);
BENCHMARK_MAIN();
```
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs