[PATCH] aarch64: Use SVE ASRD instruction with Neon modes.

Soumya AR Sun, 24 Nov 2024 20:29:52 -0800

The ASRD instruction on SVE performs an arithmetic shift right by an immediate
for divide, i.e. a signed division by a power of two that rounds towards zero.


This patch enables the use of ASRD with Neon modes.

For example:

int8_t in[N], out[N];

void
foo (void)
{
  for (int i = 0; i < N; i++)
    out[i] = in[i] / 4;
}

compiles to:

        ldr     q31, [x1, x0]
        cmlt    v30.16b, v31.16b, #0
        and     z30.b, z30.b, 3
        add     v30.16b, v30.16b, v31.16b
        sshr    v30.16b, v30.16b, 2
        str     q30, [x0, x2]
        add     x0, x0, 16
        cmp     x0, 1024

but can just be:

        ldp     q30, q31, [x0], 32
        asrd    z31.b, p7/m, z31.b, #2
        asrd    z30.b, p7/m, z30.b, #2
        stp     q30, q31, [x1], 32
        cmp     x0, x2

The patch was bootstrapped and regtested on aarch64-linux-gnu with no regressions.
OK for mainline?

Signed-off-by: Soumya AR <soum...@nvidia.com>

gcc/ChangeLog:

        * config/aarch64/aarch64-sve.md (sdiv_pow2<mode>3): Extend to
        support Neon modes.
        (*sdiv_pow2<mode>3): Likewise.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/sve/sve-asrd.c: New test.
---
 gcc/config/aarch64/aarch64-sve.md             | 25 ++++-----
 .../gcc.target/aarch64/sve/sve-asrd.c         | 54 +++++++++++++++++++
 2 files changed, 67 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/sve-asrd.c

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index affdb24a93d..96effe4abed 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -4972,34 +4972,35 @@
 
 ;; Unpredicated ASRD.
 (define_expand "sdiv_pow2<mode>3"
-  [(set (match_operand:SVE_I 0 "register_operand")
-       (unspec:SVE_I
+  [(set (match_operand:SVE_VDQ_I 0 "register_operand")
+       (unspec:SVE_VDQ_I
          [(match_dup 3)
-          (unspec:SVE_I
-            [(match_operand:SVE_I 1 "register_operand")
+          (unspec:SVE_VDQ_I
+            [(match_operand:SVE_VDQ_I 1 "register_operand")
              (match_operand 2 "aarch64_simd_rshift_imm")]
             UNSPEC_ASRD)]
         UNSPEC_PRED_X))]
   "TARGET_SVE"
   {
-    operands[3] = aarch64_ptrue_reg (<VPRED>mode);
+    operands[3] = aarch64_ptrue_reg (<VPRED>mode,
+                                   GET_MODE_UNIT_SIZE (<MODE>mode));
   }
 )
 
 ;; Predicated ASRD.
 (define_insn "*sdiv_pow2<mode>3"
-  [(set (match_operand:SVE_I 0 "register_operand")
-       (unspec:SVE_I
+  [(set (match_operand:SVE_VDQ_I 0 "register_operand")
+       (unspec:SVE_VDQ_I
          [(match_operand:<VPRED> 1 "register_operand")
-          (unspec:SVE_I
-            [(match_operand:SVE_I 2 "register_operand")
-             (match_operand:SVE_I 3 "aarch64_simd_rshift_imm")]
+          (unspec:SVE_VDQ_I
+            [(match_operand:SVE_VDQ_I 2 "register_operand")
+             (match_operand:SVE_VDQ_I 3 "aarch64_simd_rshift_imm")]
             UNSPEC_ASRD)]
          UNSPEC_PRED_X))]
   "TARGET_SVE"
   {@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
-     [ w        , Upl , 0 ; *              ] asrd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3
-     [ ?&w      , Upl , w ; yes            ] movprfx\t%0, %2\;asrd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3
+     [ w        , Upl , 0 ; *              ] asrd\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, #%3
+     [ ?&w      , Upl , w ; yes            ] movprfx\t%Z0, %Z2\;asrd\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, #%3
   }
 )
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sve-asrd.c b/gcc/testsuite/gcc.target/aarch64/sve/sve-asrd.c
new file mode 100644
index 00000000000..00aa8b2380d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/sve-asrd.c
@@ -0,0 +1,54 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast --param aarch64-autovec-preference=asimd-only" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <stdint.h>
+#define N 1024
+
+#define FUNC(M)                     \
+M in_##M[N];                        \
+M out_##M[N];                       \
+void asrd_##M() {                   \
+  for (int i = 0; i < N; i++)       \
+    out_##M[i] = in_##M[i] / 4;     \
+}
+
+/*
+** asrd_int8_t:
+**     ...
+**     ptrue   (p[0-7]).b, vl1
+**     ...
+**     asrd    z[0-9]+\.b, \1/m, z[0-9]+\.b, #2
+**     ...
+*/
+FUNC(int8_t)
+
+/*
+** asrd_int16_t:
+**     ...
+**     ptrue   (p[0-7]).b, vl2
+**     ...
+**     asrd    z[0-9]+\.h, \1/m, z[0-9]+\.h, #2
+**     ...
+*/
+FUNC(int16_t)
+
+/*
+** asrd_int32_t:
+**     ...
+**     ptrue   (p[0-7]).b, vl4
+**     ...
+**     asrd    z[0-9]+\.s, \1/m, z[0-9]+\.s, #2
+**     ...
+*/
+FUNC(int32_t)
+
+/*
+** asrd_int64_t:
+**     ...
+**     ptrue   (p[0-7]).b, vl8
+**     ...
+**     asrd    z[0-9]+\.d, \1/m, z[0-9]+\.d, #2
+**     ...
+*/
+FUNC(int64_t)
-- 
2.43.2

[PATCH] aarch64: Use SVE ASRD instruction with Neon modes.

Reply via email to