On Sun, Jan 5, 2025 at 10:06 PM Dhruv Chawla <dhr...@nvidia.com> wrote: > > This patch modifies Advanced SIMD assembly generation to emit an LDR > instruction when a vector is created using a load to the first element with > the > other elements being zero. > > This is similar to what *aarch64_combinez<mode> already does. > > Example: > > uint8x16_t foo(uint8_t *x) { > uint8x16_t r = vdupq_n_u8(0); > r[0] = *x; > return r; > } > > Currently, this generates: > > foo: > movi v0.4s, 0 > ld1 {v0.b}[0], [x0] > ret > > After applying the patch, this generates: > > foo: > ldr b0, [x0] > ret > > Bootstrapped and regtested on aarch64-linux-gnu. > > Signed-off-by: Dhruv Chawla <dhr...@nvidia.com> > > gcc/ChangeLog: > > * config/aarch64/aarch64-simd.md > (*aarch64_simd_vec_set_low<mode>): New pattern. > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/simd/ldr_first_1.c: New test. > --- > gcc/config/aarch64/aarch64-simd.md | 12 ++++ > .../gcc.target/aarch64/simd/ldr_first_1.c | 55 +++++++++++++++++++ > 2 files changed, 67 insertions(+) > create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c > > diff --git a/gcc/config/aarch64/aarch64-simd.md > b/gcc/config/aarch64/aarch64-simd.md > index 7959cca520a..b8a1e01b92f 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -1164,6 +1164,18 @@ > [(set_attr "type" "neon_logic<q>")] > ) > > +(define_insn "*aarch64_simd_vec_set_low<mode>" > + [(set (match_operand:VALL_F16 0 "register_operand" "=w") > + (vec_merge:VALL_F16 > + (vec_duplicate:VALL_F16 > + (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand" > "m")) > + (match_operand:VALL_F16 3 "aarch64_simd_imm_zero" "i") > + (match_operand:SI 2 "immediate_operand" "i")))] > + "TARGET_FLOAT && exact_log2 (INTVAL (operands[2])) == 0"
This is NOT correct for big-endian: the condition tests the raw vec_merge mask bit, but on big-endian that bit has to be mapped through ENDIAN_LANE_N first, so mask bit 0 is not the architectural lane 0 that LDR writes. See https://gcc.gnu.org/pipermail/gcc-patches/2024-October/667088.html for a similar patch which has the big-endian fixes and handles more cases too. Thanks, Andrew > + "ldr\\t%<Vetype>0, %1" > + [(set_attr "type" "f_loads")] > +) > + > (define_insn "aarch64_simd_vec_set<mode>" > [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w") > (vec_merge:VALL_F16 > diff --git a/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c > b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c > new file mode 100644 > index 00000000000..c7efde21041 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c > @@ -0,0 +1,55 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -march=armv8-a+bf16" } */ > + > +#include <arm_neon.h> > + > +#define LDR(S, T, U) > \ > + __attribute__ ((noinline)) S S##T##U (T *x) > \ > + { > \ > + S r = vdupq_n_##U (0); > \ > + r[0] = *x; > \ > + return r; > \ > + } > + > +LDR (int8x16_t, int8_t, s8) > +LDR (int16x8_t, int16_t, s16) > +LDR (int32x4_t, int32_t, s32) > +LDR (int64x2_t, int64_t, s64) > + > +LDR (uint8x16_t, uint8_t, u8) > +LDR (uint16x8_t, uint16_t, u16) > +LDR (uint32x4_t, uint32_t, u32) > +LDR (uint64x2_t, uint64_t, u64) > + > +LDR (float16x8_t, float16_t, f16) > +LDR (float32x4_t, float32_t, f32) > +LDR (float64x2_t, float64_t, f64) > + > +LDR (bfloat16x8_t, bfloat16_t, bf16) > + > +#define LDR_NARROW(S, T, U) > \ > + __attribute__ ((noinline)) S S##T##U (T *x) > \ > + { > \ > + S r = vdup_n_##U (0); > \ > + r[0] = *x; > \ > + return r; > \ > + } > + > +LDR_NARROW (int8x8_t, int8_t, s8) > +LDR_NARROW (int16x4_t, int16_t, s16) > +LDR_NARROW (int32x2_t, int32_t, s32) > +LDR_NARROW (int64x1_t, int64_t, s64) > + > +LDR_NARROW (uint8x8_t, uint8_t, u8) > +LDR_NARROW (uint16x4_t, uint16_t, u16) > +LDR_NARROW (uint32x2_t, uint32_t, u32) > +LDR_NARROW (uint64x1_t, uint64_t, u64) > + > +LDR_NARROW (float16x4_t, float16_t, f16) > +LDR_NARROW (float32x2_t, float32_t, f32) > +LDR_NARROW 
(float64x1_t, float64_t, f64) > + > +LDR_NARROW (bfloat16x4_t, bfloat16_t, bf16) > + > +/* { dg-final { scan-assembler-times "\\tldr" 24 } } */ > +/* { dg-final { scan-assembler-not "\\tmov" } } */ > -- > 2.44.0