Author: Oliver Stannard
Date: 2025-03-04T08:10:22Z
New Revision: a619a2e53a9ba09ba18a047b8389bf4dd1912b72
URL: https://github.com/llvm/llvm-project/commit/a619a2e53a9ba09ba18a047b8389bf4dd1912b72
DIFF: https://github.com/llvm/llvm-project/commit/a619a2e53a9ba09ba18a047b8389bf4dd1912b72.diff

LOG: [ARM] Fix lane ordering for AdvSIMD intrinsics on big-endian targets (#127068)

In arm_neon.h, we insert shufflevectors around each intrinsic when the
target is big-endian, to compensate for the difference between the
ABI-defined memory format of vectors (with the whole vector stored as
one big-endian access) and LLVM's target-independent expectations (with
the lowest-numbered lane in the lowest address).

However, this code was written for the AArch64 ABI, and the AArch32 ABI
differs slightly: it requires that vectors are stored in memory as-if
stored with VSTM, which does a series of 64-bit accesses, instead of
the AArch64 STR, which does a single 128-bit access. This means that
for AArch32 we need to reverse the lanes in each 64-bit chunk of the
vector, instead of in the whole vector.

Since there are only a small number of different shufflevector
orderings needed, I've split them out into macros, so that this doesn't
need separate conditions in each intrinsic definition.

Added: 
    clang/test/CodeGen/arm-neon-endianness.c

Modified: 
    clang/utils/TableGen/NeonEmitter.cpp

Removed: 
    


################################################################################
diff --git a/clang/test/CodeGen/arm-neon-endianness.c b/clang/test/CodeGen/arm-neon-endianness.c
new file mode 100644
index 0000000000000..ba2471ee39d3e
--- /dev/null
+++ b/clang/test/CodeGen/arm-neon-endianness.c
@@ -0,0 +1,115 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+
+// REQUIRES: arm-registered-target
+
+// RUN: %clang_cc1 -triple armv8a-arm-none-eabihf -target-cpu generic -emit-llvm -o - %s -disable-O0-optnone | \
+// RUN:   opt -S -passes=instcombine -o - | FileCheck %s --check-prefix=LE
+// RUN: %clang_cc1 -triple armebv8a-arm-none-eabihf -target-cpu generic -emit-llvm -o - %s -disable-O0-optnone | \
+// RUN:   opt -S -passes=instcombine -o - | FileCheck %s --check-prefix=BE
+
+#include <arm_neon.h>
+
+// LE-LABEL: define dso_local i32 @int32x4_t_lane_0(
+// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// LE-NEXT: [[ENTRY:.*:]]
+// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 0
+// LE-NEXT: ret i32 [[VGET_LANE]]
+//
+// BE-LABEL: define dso_local i32 @int32x4_t_lane_0(
+// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// BE-NEXT: [[ENTRY:.*:]]
+// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 1
+// BE-NEXT: ret i32 [[VGET_LANE]]
+//
+int int32x4_t_lane_0(int32x4_t a) { return vgetq_lane_s32(a, 0); }
+// LE-LABEL: define dso_local i32 @int32x4_t_lane_1(
+// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// LE-NEXT: [[ENTRY:.*:]]
+// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 1
+// LE-NEXT: ret i32 [[VGET_LANE]]
+//
+// BE-LABEL: define dso_local i32 @int32x4_t_lane_1(
+// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// BE-NEXT: [[ENTRY:.*:]]
+// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 0
+// BE-NEXT: ret i32 [[VGET_LANE]]
+//
+int int32x4_t_lane_1(int32x4_t a) { return vgetq_lane_s32(a, 1); }
+// LE-LABEL: define dso_local i32 @int32x4_t_lane_2(
+// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// LE-NEXT: [[ENTRY:.*:]]
+// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 2
+// LE-NEXT: ret i32 [[VGET_LANE]]
+//
+// BE-LABEL: define dso_local i32 @int32x4_t_lane_2(
+// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// BE-NEXT: [[ENTRY:.*:]]
+// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 3
+// BE-NEXT: ret i32 [[VGET_LANE]]
+//
+int int32x4_t_lane_2(int32x4_t a) { return vgetq_lane_s32(a, 2); }
+// LE-LABEL: define dso_local i32 @int32x4_t_lane_3(
+// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// LE-NEXT: [[ENTRY:.*:]]
+// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 3
+// LE-NEXT: ret i32 [[VGET_LANE]]
+//
+// BE-LABEL: define dso_local i32 @int32x4_t_lane_3(
+// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// BE-NEXT: [[ENTRY:.*:]]
+// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 2
+// BE-NEXT: ret i32 [[VGET_LANE]]
+//
+int int32x4_t_lane_3(int32x4_t a) { return vgetq_lane_s32(a, 3); }
+// LE-LABEL: define dso_local i32 @int32x2_t_lane_0(
+// LE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// LE-NEXT: [[ENTRY:.*:]]
+// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 0
+// LE-NEXT: ret i32 [[VGET_LANE]]
+//
+// BE-LABEL: define dso_local i32 @int32x2_t_lane_0(
+// BE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// BE-NEXT: [[ENTRY:.*:]]
+// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 1
+// BE-NEXT: ret i32 [[VGET_LANE]]
+//
+int int32x2_t_lane_0(int32x2_t a) { return vget_lane_s32(a, 0); }
+// LE-LABEL: define dso_local i32 @int32x2_t_lane_1(
+// LE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// LE-NEXT: [[ENTRY:.*:]]
+// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 1
+// LE-NEXT: ret i32 [[VGET_LANE]]
+//
+// BE-LABEL: define dso_local i32 @int32x2_t_lane_1(
+// BE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// BE-NEXT: [[ENTRY:.*:]]
+// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 0
+// BE-NEXT: ret i32 [[VGET_LANE]]
+//
+int int32x2_t_lane_1(int32x2_t a) { return vget_lane_s32(a, 1); }
+// LE-LABEL: define dso_local i64 @int64x2_t_lane_0(
+// LE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// LE-NEXT: [[ENTRY:.*:]]
+// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 0
+// LE-NEXT: ret i64 [[VGET_LANE]]
+//
+// BE-LABEL: define dso_local i64 @int64x2_t_lane_0(
+// BE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// BE-NEXT: [[ENTRY:.*:]]
+// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 0
+// BE-NEXT: ret i64 [[VGET_LANE]]
+//
+int64_t int64x2_t_lane_0(int64x2_t a) { return vgetq_lane_s64(a, 0); }
+// LE-LABEL: define dso_local i64 @int64x2_t_lane_1(
+// LE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// LE-NEXT: [[ENTRY:.*:]]
+// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 1
+// LE-NEXT: ret i64 [[VGET_LANE]]
+//
+// BE-LABEL: define dso_local i64 @int64x2_t_lane_1(
+// BE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// BE-NEXT: [[ENTRY:.*:]]
+// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 1
+// BE-NEXT: ret i64 [[VGET_LANE]]
+//
+int64_t int64x2_t_lane_1(int64x2_t a) { return vgetq_lane_s64(a, 1); }

diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index a18f78697af1c..5669b5e329587 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -1263,20 +1263,17 @@ void Intrinsic::emitReverseVariable(Variable &Dest, Variable &Src) {
 
     for (unsigned K = 0; K < Dest.getType().getNumVectors(); ++K) {
       OS << "  " << Dest.getName() << ".val[" << K << "] = "
-         << "__builtin_shufflevector("
-         << Src.getName() << ".val[" << K << "], "
-         << Src.getName() << ".val[" << K << "]";
-      for (int J = Dest.getType().getNumElements() - 1; J >= 0; --J)
-        OS << ", " << J;
-      OS << ");";
+         << "__builtin_shufflevector(" << Src.getName() << ".val[" << K << "], "
+         << Src.getName() << ".val[" << K << "], __lane_reverse_"
+         << Dest.getType().getSizeInBits() << "_"
+         << Dest.getType().getElementSizeInBits() << ");";
       emitNewLine();
     }
   } else {
-    OS << "  " << Dest.getName()
-       << " = __builtin_shufflevector(" << Src.getName() << ", " << Src.getName();
-    for (int J = Dest.getType().getNumElements() - 1; J >= 0; --J)
-      OS << ", " << J;
-    OS << ");";
+    OS << "  " << Dest.getName() << " = __builtin_shufflevector("
+       << Src.getName() << ", " << Src.getName() << ", __lane_reverse_"
+       << Dest.getType().getSizeInBits() << "_"
+       << Dest.getType().getElementSizeInBits() << ");";
     emitNewLine();
   }
 }
@@ -1877,10 +1874,11 @@ std::string Intrinsic::generate() {
 
   OS << "#else\n";
 
-  // Big endian intrinsics are more complex. The user intended these
-  // intrinsics to operate on a vector "as-if" loaded by (V)LDR,
-  // but we load as-if (V)LD1. So we should swap all arguments and
-  // swap the return value too.
+  // Big endian intrinsics are more complex. The user intended these intrinsics
+  // to operate on a vector "as-if" loaded by LDR (for AArch64), VLDR (for
+  // 64-bit vectors on AArch32), or VLDM (for 128-bit vectors on AArch32) but
+  // we load as-if LD1 (for AArch64) or VLD1 (for AArch32). So we should swap
+  // all arguments and swap the return value too.
   //
   // If we call sub-intrinsics, we should call a version that does
   // not re-swap the arguments!
@@ -2434,6 +2432,31 @@ void NeonEmitter::run(raw_ostream &OS) {
   OS << "#define __ai static __inline__ __attribute__((__always_inline__, "
         "__nodebug__))\n\n";
 
+  // Shufflevector arguments lists for endian-swapping vectors for big-endian
+  // targets. For AArch64, we need to reverse every lane in the vector, but for
+  // AArch32 we need to reverse the lanes within each 64-bit chunk of the
+  // vector. The naming convention here is __lane_reverse_<n>_<m>, where <n> is
+  // the length of the vector in bits, and <m> is length of each lane in bits.
+  OS << "#if !defined(__LITTLE_ENDIAN__)\n";
+  OS << "#if defined(__aarch64__) || defined(__arm64ec__)\n";
+  OS << "#define __lane_reverse_64_32 1,0\n";
+  OS << "#define __lane_reverse_64_16 3,2,1,0\n";
+  OS << "#define __lane_reverse_64_8 7,6,5,4,3,2,1,0\n";
+  OS << "#define __lane_reverse_128_64 1,0\n";
+  OS << "#define __lane_reverse_128_32 3,2,1,0\n";
+  OS << "#define __lane_reverse_128_16 7,6,5,4,3,2,1,0\n";
+  OS << "#define __lane_reverse_128_8 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0\n";
+  OS << "#else\n";
+  OS << "#define __lane_reverse_64_32 1,0\n";
+  OS << "#define __lane_reverse_64_16 3,2,1,0\n";
+  OS << "#define __lane_reverse_64_8 7,6,5,4,3,2,1,0\n";
+  OS << "#define __lane_reverse_128_64 0,1\n";
+  OS << "#define __lane_reverse_128_32 1,0,3,2\n";
+  OS << "#define __lane_reverse_128_16 3,2,1,0,7,6,5,4\n";
+  OS << "#define __lane_reverse_128_8 7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8\n";
+  OS << "#endif\n";
+  OS << "#endif\n";
+
   SmallVector<Intrinsic *, 128> Defs;
   for (const Record *R : Records.getAllDerivedDefinitions("Inst"))
     createIntrinsic(R, Defs);
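For context (not part of the commit): the __lane_reverse_* macros end up as the
index arguments of the __builtin_shufflevector calls that NeonEmitter writes
into arm_neon.h. A minimal sketch of the resulting pattern on a big-endian
AArch32 target is shown below; the vector typedef and helper name are
hypothetical stand-ins for illustration, and only the macro body (1,0,3,2)
comes from the patch.

    /* Sketch, assuming a big-endian AArch32 compile (__LITTLE_ENDIAN__ not
       defined, __aarch64__/__arm64ec__ not defined). The typedef and helper
       name are hypothetical; arm_neon.h emits the shuffle inline instead. */
    typedef int my_int32x4_t __attribute__((vector_size(16)));

    /* Reverse the lanes within each 64-bit chunk of a 128-bit vector of
       32-bit lanes, matching the AArch32 big-endian (VLDM/VSTM) layout:
       lanes 0 and 1 are swapped, as are lanes 2 and 3. */
    #define __lane_reverse_128_32 1,0,3,2

    static inline my_int32x4_t my_rev_q_s32(my_int32x4_t v) {
      return __builtin_shufflevector(v, v, __lane_reverse_128_32);
    }

On a big-endian AArch64 target the same macro instead expands to 3,2,1,0,
reversing the whole vector rather than each 64-bit half.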