This patch modifies Advanced SIMD assembly generation to emit an LDR
instruction when a vector is created using a load to the first element with
the other elements being zero.

This is similar to what *aarch64_combinez<mode> already does.

Example:

uint8x16_t foo(uint8_t *x)
{
  uint8x16_t r = vdupq_n_u8(0);
  r[0] = *x;
  return r;
}

Currently, this generates:

foo:
        movi    v0.4s, 0
        ld1     {v0.b}[0], [x0]
        ret

After applying the patch, this generates:

foo:
        ldr     b0, [x0]
        ret

Bootstrapped and regtested on aarch64-linux-gnu.

Signed-off-by: Dhruv Chawla <dhr...@nvidia.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (*aarch64_simd_vec_set_low<mode>):
	New pattern.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/ldr_first_1.c: New test.

---
 gcc/config/aarch64/aarch64-simd.md            | 12 ++++
 .../gcc.target/aarch64/simd/ldr_first_1.c     | 55 +++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 7959cca520a..b8a1e01b92f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1164,6 +1164,18 @@
   [(set_attr "type" "neon_logic<q>")]
 )
 
+(define_insn "*aarch64_simd_vec_set_low<mode>"
+  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
+	(vec_merge:VALL_F16
+	  (vec_duplicate:VALL_F16
+	    (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand" "m"))
+	  (match_operand:VALL_F16 3 "aarch64_simd_imm_zero" "i")
+	  (match_operand:SI 2 "immediate_operand" "i")))]
+  "TARGET_FLOAT && exact_log2 (INTVAL (operands[2])) == 0"
+  "ldr\\t%<Vetype>0, %1"
+  [(set_attr "type" "f_loads")]
+)
+
 (define_insn "aarch64_simd_vec_set<mode>"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
 	(vec_merge:VALL_F16
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c
new file mode 100644
index 00000000000..c7efde21041
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c
@@ -0,0 +1,55 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8-a+bf16" } */
+
+#include <arm_neon.h>
+
+#define LDR(S, T, U)				\
+  __attribute__ ((noinline)) S S##T##U (T *x)	\
+  {						\
+    S r = vdupq_n_##U (0);			\
+    r[0] = *x;					\
+    return r;					\
+  }
+
+LDR (int8x16_t, int8_t, s8)
+LDR (int16x8_t, int16_t, s16)
+LDR (int32x4_t, int32_t, s32)
+LDR (int64x2_t, int64_t, s64)
+
+LDR (uint8x16_t, uint8_t, u8)
+LDR (uint16x8_t, uint16_t, u16)
+LDR (uint32x4_t, uint32_t, u32)
+LDR (uint64x2_t, uint64_t, u64)
+
+LDR (float16x8_t, float16_t, f16)
+LDR (float32x4_t, float32_t, f32)
+LDR (float64x2_t, float64_t, f64)
+
+LDR (bfloat16x8_t, bfloat16_t, bf16)
+
+#define LDR_NARROW(S, T, U)			\
+  __attribute__ ((noinline)) S S##T##U (T *x)	\
+  {						\
+    S r = vdup_n_##U (0);			\
+    r[0] = *x;					\
+    return r;					\
+  }
+
+LDR_NARROW (int8x8_t, int8_t, s8)
+LDR_NARROW (int16x4_t, int16_t, s16)
+LDR_NARROW (int32x2_t, int32_t, s32)
+LDR_NARROW (int64x1_t, int64_t, s64)
+
+LDR_NARROW (uint8x8_t, uint8_t, u8)
+LDR_NARROW (uint16x4_t, uint16_t, u16)
+LDR_NARROW (uint32x2_t, uint32_t, u32)
+LDR_NARROW (uint64x1_t, uint64_t, u64)
+
+LDR_NARROW (float16x4_t, float16_t, f16)
+LDR_NARROW (float32x2_t, float32_t, f32)
+LDR_NARROW (float64x1_t, float64_t, f64)
+
+LDR_NARROW (bfloat16x4_t, bfloat16_t, bf16)
+
+/* { dg-final { scan-assembler-times "\\tldr" 24 } } */
+/* { dg-final { scan-assembler-not "\\tmov" } } */
-- 
2.44.0
From 34024558ad9fa831b337f40e01e2f0c743a1de7b Mon Sep 17 00:00:00 2001
From: Dhruv Chawla <dhr...@nvidia.com>
Date: Thu, 19 Dec 2024 19:56:23 -0800
Subject: [PATCH] aarch64: Use LDR for first-element loads for Advanced SIMD

This patch modifies Advanced SIMD assembly generation to emit an LDR
instruction when a vector is created using a load to the first element with
the other elements being zero.

This is similar to what *aarch64_combinez<mode> already does.

Example:

uint8x16_t foo(uint8_t *x)
{
  uint8x16_t r = vdupq_n_u8(0);
  r[0] = *x;
  return r;
}

Currently, this generates:

foo:
        movi    v0.4s, 0
        ld1     {v0.b}[0], [x0]
        ret

After applying the patch, this generates:

foo:
        ldr     b0, [x0]
        ret

Bootstrapped and regtested on aarch64-linux-gnu.

Signed-off-by: Dhruv Chawla <dhr...@nvidia.com>

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (*aarch64_simd_vec_set_low<mode>):
	New pattern.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/ldr_first_1.c: New test.

---
 gcc/config/aarch64/aarch64-simd.md            | 12 ++++
 .../gcc.target/aarch64/simd/ldr_first_1.c     | 55 +++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 7959cca520a..b8a1e01b92f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1164,6 +1164,18 @@
   [(set_attr "type" "neon_logic<q>")]
 )
 
+(define_insn "*aarch64_simd_vec_set_low<mode>"
+  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
+	(vec_merge:VALL_F16
+	  (vec_duplicate:VALL_F16
+	    (match_operand:<VEL> 1 "aarch64_simd_nonimmediate_operand" "m"))
+	  (match_operand:VALL_F16 3 "aarch64_simd_imm_zero" "i")
+	  (match_operand:SI 2 "immediate_operand" "i")))]
+  "TARGET_FLOAT && exact_log2 (INTVAL (operands[2])) == 0"
+  "ldr\\t%<Vetype>0, %1"
+  [(set_attr "type" "f_loads")]
+)
+
 (define_insn "aarch64_simd_vec_set<mode>"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
 	(vec_merge:VALL_F16
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c
new file mode 100644
index 00000000000..c7efde21041
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/ldr_first_1.c
@@ -0,0 +1,55 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8-a+bf16" } */
+
+#include <arm_neon.h>
+
+#define LDR(S, T, U)				\
+  __attribute__ ((noinline)) S S##T##U (T *x)	\
+  {						\
+    S r = vdupq_n_##U (0);			\
+    r[0] = *x;					\
+    return r;					\
+  }
+
+LDR (int8x16_t, int8_t, s8)
+LDR (int16x8_t, int16_t, s16)
+LDR (int32x4_t, int32_t, s32)
+LDR (int64x2_t, int64_t, s64)
+
+LDR (uint8x16_t, uint8_t, u8)
+LDR (uint16x8_t, uint16_t, u16)
+LDR (uint32x4_t, uint32_t, u32)
+LDR (uint64x2_t, uint64_t, u64)
+
+LDR (float16x8_t, float16_t, f16)
+LDR (float32x4_t, float32_t, f32)
+LDR (float64x2_t, float64_t, f64)
+
+LDR (bfloat16x8_t, bfloat16_t, bf16)
+
+#define LDR_NARROW(S, T, U)			\
+  __attribute__ ((noinline)) S S##T##U (T *x)	\
+  {						\
+    S r = vdup_n_##U (0);			\
+    r[0] = *x;					\
+    return r;					\
+  }
+
+LDR_NARROW (int8x8_t, int8_t, s8)
+LDR_NARROW (int16x4_t, int16_t, s16)
+LDR_NARROW (int32x2_t, int32_t, s32)
+LDR_NARROW (int64x1_t, int64_t, s64)
+
+LDR_NARROW (uint8x8_t, uint8_t, u8)
+LDR_NARROW (uint16x4_t, uint16_t, u16)
+LDR_NARROW (uint32x2_t, uint32_t, u32)
+LDR_NARROW (uint64x1_t, uint64_t, u64)
+
+LDR_NARROW (float16x4_t, float16_t, f16)
+LDR_NARROW (float32x2_t, float32_t, f32)
+LDR_NARROW (float64x1_t, float64_t, f64)
+
+LDR_NARROW (bfloat16x4_t, bfloat16_t, bf16)
+
+/* { dg-final { scan-assembler-times "\\tldr" 24 } } */
+/* { dg-final { scan-assembler-not "\\tmov" } } */
-- 
2.44.0