This patch modifies Advanced SIMD assembly generation to emit an LDR
instruction when a vector is created using a load to the first element with
the other elements being zero.

This is similar to what *aarch64_combinez<mode> already does.

Example:

uint8x16_t foo(uint8_t *x)
{
  uint8x16_t r = vdupq_n_u8(0);
  r[0] = *x;
  return r;
}

Currently, this generates:

foo:
        movi    v0.4s, 0
        ld1     {v0.b}[0], [x0]
        ret

After applying the patch, this generates:

foo:
        ldr     b0, [x0]
        ret

Bootstrapped and regtested on aarch64-linux-gnu.

Signed-off-by: Dhruv Chawla <dhr...@nvidia.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (*aarch64_simd_vec_set_low<mode>):
	New pattern.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/ldr_first_1.c: New test.

---
 gcc/config/aarch64/aarch64-simd.md            | 12 ++++
 .../gcc.target/aarch64/simd/ldr_first_1.c     | 55 +++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 7959cca520a..b8a1e01b92f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1164,6 +1164,18 @@
   [(set_attr "type" "neon_logic<q>")]
 )
 
+(define_insn "*aarch64_simd_vec_set_low<mode>"
+  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
+	(vec_merge:VALL_F16
+	  (vec_duplicate:VALL_F16
+	    (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand" "m"))
+	  (match_operand:VALL_F16 3 "aarch64_simd_imm_zero" "i")
+	  (match_operand:SI 2 "immediate_operand" "i")))]
+  "TARGET_FLOAT && exact_log2 (INTVAL (operands[2])) == 0"
+  "ldr\\t%<Vetype>0, %1"
+  [(set_attr "type" "f_loads")]
+)
+
 (define_insn "aarch64_simd_vec_set<mode>"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
 	(vec_merge:VALL_F16
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c
new file mode 100644
index 00000000000..c7efde21041
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c
@@ -0,0 +1,55 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8-a+bf16" } */
+
+#include <arm_neon.h>
+
+#define LDR(S, T, U)				\
+  __attribute__ ((noinline)) S S##T##U (T *x)	\
+  {						\
+    S r = vdupq_n_##U (0);			\
+    r[0] = *x;					\
+    return r;					\
+  }
+
+LDR (int8x16_t, int8_t, s8)
+LDR (int16x8_t, int16_t, s16)
+LDR (int32x4_t, int32_t, s32)
+LDR (int64x2_t, int64_t, s64)
+
+LDR (uint8x16_t, uint8_t, u8)
+LDR (uint16x8_t, uint16_t, u16)
+LDR (uint32x4_t, uint32_t, u32)
+LDR (uint64x2_t, uint64_t, u64)
+
+LDR (float16x8_t, float16_t, f16)
+LDR (float32x4_t, float32_t, f32)
+LDR (float64x2_t, float64_t, f64)
+
+LDR (bfloat16x8_t, bfloat16_t, bf16)
+
+#define LDR_NARROW(S, T, U)			\
+  __attribute__ ((noinline)) S S##T##U (T *x)	\
+  {						\
+    S r = vdup_n_##U (0);			\
+    r[0] = *x;					\
+    return r;					\
+  }
+
+LDR_NARROW (int8x8_t, int8_t, s8)
+LDR_NARROW (int16x4_t, int16_t, s16)
+LDR_NARROW (int32x2_t, int32_t, s32)
+LDR_NARROW (int64x1_t, int64_t, s64)
+
+LDR_NARROW (uint8x8_t, uint8_t, u8)
+LDR_NARROW (uint16x4_t, uint16_t, u16)
+LDR_NARROW (uint32x2_t, uint32_t, u32)
+LDR_NARROW (uint64x1_t, uint64_t, u64)
+
+LDR_NARROW (float16x4_t, float16_t, f16)
+LDR_NARROW (float32x2_t, float32_t, f32)
+LDR_NARROW (float64x1_t, float64_t, f64)
+
+LDR_NARROW (bfloat16x4_t, bfloat16_t, bf16)
+
+/* { dg-final { scan-assembler-times "\\tldr" 24 } } */
+/* { dg-final { scan-assembler-not "\\tmov" } } */
-- 
2.44.0
From 34024558ad9fa831b337f40e01e2f0c743a1de7b Mon Sep 17 00:00:00 2001
From: Dhruv Chawla <dhr...@nvidia.com>
Date: Thu, 19 Dec 2024 19:56:23 -0800
Subject: [PATCH] aarch64: Use LDR for first-element loads for Advanced SIMD

This patch modifies Advanced SIMD assembly generation to emit an LDR
instruction when a vector is created using a load to the first element with
the other elements being zero.

This is similar to what *aarch64_combinez<mode> already does.

Example:

uint8x16_t foo(uint8_t *x)
{
  uint8x16_t r = vdupq_n_u8(0);
  r[0] = *x;
  return r;
}

Currently, this generates:

foo:
        movi    v0.4s, 0
        ld1     {v0.b}[0], [x0]
        ret

After applying the patch, this generates:

foo:
        ldr     b0, [x0]
        ret

Bootstrapped and regtested on aarch64-linux-gnu.

Signed-off-by: Dhruv Chawla <dhr...@nvidia.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (*aarch64_simd_vec_set_low<mode>):
	New pattern.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/ldr_first_1.c: New test.

---
 gcc/config/aarch64/aarch64-simd.md            | 12 ++++
 .../gcc.target/aarch64/simd/ldr_first_1.c     | 55 +++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 7959cca520a..b8a1e01b92f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1164,6 +1164,18 @@
   [(set_attr "type" "neon_logic<q>")]
 )
 
+(define_insn "*aarch64_simd_vec_set_low<mode>"
+  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
+	(vec_merge:VALL_F16
+	  (vec_duplicate:VALL_F16
+	    (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand" "m"))
+	  (match_operand:VALL_F16 3 "aarch64_simd_imm_zero" "i")
+	  (match_operand:SI 2 "immediate_operand" "i")))]
+  "TARGET_FLOAT && exact_log2 (INTVAL (operands[2])) == 0"
+  "ldr\\t%<Vetype>0, %1"
+  [(set_attr "type" "f_loads")]
+)
+
 (define_insn "aarch64_simd_vec_set<mode>"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
 	(vec_merge:VALL_F16
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c
new file mode 100644
index 00000000000..c7efde21041
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c
@@ -0,0 +1,55 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8-a+bf16" } */
+
+#include <arm_neon.h>
+
+#define LDR(S, T, U)				\
+  __attribute__ ((noinline)) S S##T##U (T *x)	\
+  {						\
+    S r = vdupq_n_##U (0);			\
+    r[0] = *x;					\
+    return r;					\
+  }
+
+LDR (int8x16_t, int8_t, s8)
+LDR (int16x8_t, int16_t, s16)
+LDR (int32x4_t, int32_t, s32)
+LDR (int64x2_t, int64_t, s64)
+
+LDR (uint8x16_t, uint8_t, u8)
+LDR (uint16x8_t, uint16_t, u16)
+LDR (uint32x4_t, uint32_t, u32)
+LDR (uint64x2_t, uint64_t, u64)
+
+LDR (float16x8_t, float16_t, f16)
+LDR (float32x4_t, float32_t, f32)
+LDR (float64x2_t, float64_t, f64)
+
+LDR (bfloat16x8_t, bfloat16_t, bf16)
+
+#define LDR_NARROW(S, T, U)			\
+  __attribute__ ((noinline)) S S##T##U (T *x)	\
+  {						\
+    S r = vdup_n_##U (0);			\
+    r[0] = *x;					\
+    return r;					\
+  }
+
+LDR_NARROW (int8x8_t, int8_t, s8)
+LDR_NARROW (int16x4_t, int16_t, s16)
+LDR_NARROW (int32x2_t, int32_t, s32)
+LDR_NARROW (int64x1_t, int64_t, s64)
+
+LDR_NARROW (uint8x8_t, uint8_t, u8)
+LDR_NARROW (uint16x4_t, uint16_t, u16)
+LDR_NARROW (uint32x2_t, uint32_t, u32)
+LDR_NARROW (uint64x1_t, uint64_t, u64)
+
+LDR_NARROW (float16x4_t, float16_t, f16)
+LDR_NARROW (float32x2_t, float32_t, f32)
+LDR_NARROW (float64x1_t, float64_t, f64)
+
+LDR_NARROW (bfloat16x4_t, bfloat16_t, bf16)
+
+/* { dg-final { scan-assembler-times "\\tldr" 24 } } */
+/* { dg-final { scan-assembler-not "\\tmov" } } */
-- 
2.44.0