On Sun, Jan 5, 2025 at 10:06 PM Dhruv Chawla <dhr...@nvidia.com> wrote:
>
> This patch modifies Advanced SIMD assembly generation to emit an LDR
> instruction when a vector is created using a load to the first element with
> the other elements being zero.
>
> This is similar to what *aarch64_combinez<mode> already does.
>
> Example:
>
> uint8x16_t foo(uint8_t *x) {
>    uint8x16_t r = vdupq_n_u8(0);
>    r[0] = *x;
>    return r;
> }
>
> Currently, this generates:
>
> foo:
>         movi    v0.4s, 0
>         ld1     {v0.b}[0], [x0]
>         ret
>
> After applying the patch, this generates:
>
> foo:
>         ldr     b0, [x0]
>         ret
>
> Bootstrapped and regtested on aarch64-linux-gnu.
>
> Signed-off-by: Dhruv Chawla <dhr...@nvidia.com>
>
> gcc/ChangeLog:
>
>         * config/aarch64/aarch64-simd.md
>         (*aarch64_simd_vec_set_low<mode>): New pattern.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/simd/ldr_first_1.c: New test.
> ---
>   gcc/config/aarch64/aarch64-simd.md            | 12 ++++
>   .../gcc.target/aarch64/simd/ldr_first_1.c     | 55 +++++++++++++++++++
>   2 files changed, 67 insertions(+)
>   create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 7959cca520a..b8a1e01b92f 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -1164,6 +1164,18 @@
>     [(set_attr "type" "neon_logic<q>")]
>   )
>
> +(define_insn "*aarch64_simd_vec_set_low<mode>"
> +  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
> +       (vec_merge:VALL_F16
> +           (vec_duplicate:VALL_F16
> +               (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand" "m"))
> +           (match_operand:VALL_F16 3 "aarch64_simd_imm_zero" "i")
> +           (match_operand:SI 2 "immediate_operand" "i")))]
> +  "TARGET_FLOAT && exact_log2 (INTVAL (operands[2])) == 0"

This is NOT correct for big-endian.
See https://gcc.gnu.org/pipermail/gcc-patches/2024-October/667088.html
for a similar patch which had the big-endian fixes and handles more
cases too.

Thanks,
Andrew

> +  "ldr\\t%<Vetype>0, %1"
> +  [(set_attr "type" "f_loads")]
> +)
> +
>   (define_insn "aarch64_simd_vec_set<mode>"
>     [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
>         (vec_merge:VALL_F16
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c
> new file mode 100644
> index 00000000000..c7efde21041
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c
> @@ -0,0 +1,55 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=armv8-a+bf16" } */
> +
> +#include <arm_neon.h>
> +
> +#define LDR(S, T, U)                          \
> +  __attribute__ ((noinline)) S S##T##U (T *x) \
> +  {                                           \
> +    S r = vdupq_n_##U (0);                    \
> +    r[0] = *x;                                \
> +    return r;                                 \
> +  }
> +
> +LDR (int8x16_t, int8_t, s8)
> +LDR (int16x8_t, int16_t, s16)
> +LDR (int32x4_t, int32_t, s32)
> +LDR (int64x2_t, int64_t, s64)
> +
> +LDR (uint8x16_t, uint8_t, u8)
> +LDR (uint16x8_t, uint16_t, u16)
> +LDR (uint32x4_t, uint32_t, u32)
> +LDR (uint64x2_t, uint64_t, u64)
> +
> +LDR (float16x8_t, float16_t, f16)
> +LDR (float32x4_t, float32_t, f32)
> +LDR (float64x2_t, float64_t, f64)
> +
> +LDR (bfloat16x8_t, bfloat16_t, bf16)
> +
> +#define LDR_NARROW(S, T, U)                   \
> +  __attribute__ ((noinline)) S S##T##U (T *x) \
> +  {                                           \
> +    S r = vdup_n_##U (0);                     \
> +    r[0] = *x;                                \
> +    return r;                                 \
> +  }
> +
> +LDR_NARROW (int8x8_t, int8_t, s8)
> +LDR_NARROW (int16x4_t, int16_t, s16)
> +LDR_NARROW (int32x2_t, int32_t, s32)
> +LDR_NARROW (int64x1_t, int64_t, s64)
> +
> +LDR_NARROW (uint8x8_t, uint8_t, u8)
> +LDR_NARROW (uint16x4_t, uint16_t, u16)
> +LDR_NARROW (uint32x2_t, uint32_t, u32)
> +LDR_NARROW (uint64x1_t, uint64_t, u64)
> +
> +LDR_NARROW (float16x4_t, float16_t, f16)
> +LDR_NARROW (float32x2_t, float32_t, f32)
> +LDR_NARROW (float64x1_t, float64_t, f64)
> +
> +LDR_NARROW (bfloat16x4_t, bfloat16_t, bf16)
> +
> +/* { dg-final { scan-assembler-times "\\tldr" 24 } } */
> +/* { dg-final { scan-assembler-not "\\tmov" } } */
> --
> 2.44.0

Reply via email to