Hi,
This patch teaches the aarch64 backend to improve codegen for the SVE dupq
builtins when the 128-bit input vector has a repeating pattern. In that case
it attempts to use a smaller NEON vector (or a single element) to limit the
number of instructions needed to construct the input vector.
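As a small hand-written illustration (the function name dup_pair below is
made up for this example; the new dup_opt.c test covers the equivalent
int32_0 case), a call such as:

  #include <arm_sve.h>

  /* The [a, b, a, b] input repeats after two elements, so only the 64-bit
     pair [a, b] needs to be constructed with NEON before being broadcast
     into the SVE register.  */
  svuint32_t
  dup_pair (uint32_t a, uint32_t b)
  {
    return svdupq_n_u32 (a, b, a, b);
  }

should now be expanded using an fmov/ins pair to build the 64-bit input
followed by a single mov into z0.d, instead of constructing the full
128-bit vector first.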
Bootstrapped and regression tested on aarch64-none-linux-gnu.
Is this OK for trunk?
gcc/ChangeLog:
* config/aarch64/aarch64.cc (aarch64_simd_container_mode): Make
it global.
* config/aarch64/aarch64-protos.h
(aarch64_simd_container_mode): Declare it.
* config/aarch64/aarch64-sve.md (*vec_duplicate<mode>_reg):
Rename this to ...
(@aarch64_vec_duplicate_reg_<mode>): ... this.
* config/aarch64/aarch64-sve-builtins-base.cc
(svdupq_impl::expand): Improve codegen when the inputs form a
repeating pattern.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/sve/dup_opt.c: New test.
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 2ac781dff4a93cbe0f0b091147b2521ed1a88750..cfc31b467cf1d3cd79b2dfe6a54e6910dd43b5d8 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -771,6 +771,7 @@ int aarch64_branch_cost (bool, bool);
enum aarch64_symbol_type aarch64_classify_symbolic_expression (rtx);
bool aarch64_advsimd_struct_mode_p (machine_mode mode);
opt_machine_mode aarch64_vq_mode (scalar_mode);
+machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
opt_machine_mode aarch64_full_sve_mode (scalar_mode);
bool aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode);
bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index c24c05487246f529f81867d6429e636fd6dc74d0..f8b755a83dc37578363270618323f87c95fa327f 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -875,13 +875,98 @@ public:
argument N to go into architectural lane N, whereas Advanced SIMD
vectors are loaded memory lsb to register lsb. We therefore need
to reverse the elements for big-endian targets. */
- rtx vq_reg = gen_reg_rtx (vq_mode);
rtvec vec = rtvec_alloc (elements_per_vq);
for (unsigned int i = 0; i < elements_per_vq; ++i)
{
unsigned int argno = BYTES_BIG_ENDIAN ? elements_per_vq - i - 1 : i;
RTVEC_ELT (vec, i) = e.args[argno];
}
+
+ /* Look for a repeating pattern in the 128-bit input, since that
+ potentially simplifies constructing the input vector.
+ For example, codegen for svdupq_n_s32 (a, b, a, b) could be simplified
+ from:
+ dup v0.4s, w0
+ fmov s1, w1
+ ins v0.s[1], v1.s[0]
+ ins v0.s[3], v1.s[0]
+ dup z0.q, z0.q[0]
+ to:
+ fmov d0, x0
+ ins v0.s[1], w1
+ mov z0.d, d0
+ since only the smaller [a, b] input vector needs to be constructed,
+ reducing the number of instructions needed. */
+ if (elements_per_vq > 1 && mode == e.vector_mode (0))
+ {
+ unsigned int new_elements_n = elements_per_vq;
+ bool group = true;
+ while (group && new_elements_n > 1)
+ {
+ for (unsigned int i = 0; i < new_elements_n / 2; ++i)
+ {
+ if (!rtx_equal_p (RTVEC_ELT (vec, i),
+ RTVEC_ELT (vec, new_elements_n / 2 + i)))
+ {
+ group = false;
+ break;
+ }
+ }
+ if (group)
+ new_elements_n /= 2;
+ }
+ /* We have found a repeating pattern smaller than 128 bits, so use that
+ instead. */
+ if (new_elements_n < elements_per_vq)
+ {
+ unsigned int input_size = 128 / elements_per_vq * new_elements_n;
+ scalar_mode new_mode
+ = int_mode_for_size (input_size, 0).require ();
+ rtx input;
+ if (new_elements_n > 1)
+ {
+ if (input_size < 64)
+ {
+ /* TODO: Remove this when support for 32- and 16-bit vectors
+ is added.  */
+ new_elements_n *= 64 / input_size;
+ input_size = 64;
+ new_mode = int_mode_for_size (input_size, 0).require ();
+ }
+ input = gen_reg_rtx (new_mode);
+ rtvec new_vec = rtvec_alloc (new_elements_n);
+ for (unsigned int i = 0; i < new_elements_n; ++i)
+ RTVEC_ELT (new_vec, i) = RTVEC_ELT (vec, i);
+
+ machine_mode merge_mode
+ = aarch64_simd_container_mode (element_mode, input_size);
+
+ rtx merge_subreg = simplify_gen_subreg (merge_mode, input,
+ new_mode, 0);
+ aarch64_expand_vector_init (merge_subreg,
+ gen_rtx_PARALLEL (merge_mode,
+ new_vec));
+ }
+ else
+ input = simplify_gen_subreg (new_mode, RTVEC_ELT (vec, 0),
+ element_mode, 0);
+ machine_mode sve_mode
+ = aarch64_full_sve_mode (new_mode).require ();
+
+ rtx target = simplify_gen_subreg (sve_mode, e.possible_target,
+ mode, 0);
+
+ expand_operand ops[2];
+ create_output_operand (&ops[0], target, sve_mode);
+ create_fixed_operand (&ops[1], input);
+ expand_insn (code_for_aarch64_vec_duplicate_reg (sve_mode), 2,
+ ops);
+ return e.possible_target;
+ }
+ }
+
+ rtx vq_reg = gen_reg_rtx (vq_mode);
aarch64_expand_vector_init (vq_reg, gen_rtx_PARALLEL (vq_mode, vec));
/* If the result is a boolean, compare the data vector against zero. */
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index bd60e65b0c3f05f1c931f03807170f3b9d699de5..a7d6041bcda03318ff10f6d425889801b9a8fa63 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -2508,7 +2508,7 @@ (define_expand "vec_duplicate<mode>"
;; the scalar input gets spilled to memory during RA. We want to split
;; the load at the first opportunity in order to allow the PTRUE to be
;; optimized with surrounding code.
-(define_insn_and_split "*vec_duplicate<mode>_reg"
+(define_insn_and_split "@aarch64_vec_duplicate_reg_<mode>"
[(set (match_operand:SVE_ALL 0 "register_operand" "=w, w, w")
(vec_duplicate:SVE_ALL
(match_operand:<VEL> 1 "aarch64_sve_dup_operand" "r, w, Uty")))
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f650abbc4ce49cf0947049931f86bad1130c3428..f5e66a43ec5d47e6f5d5540cb41fba0e0e9f92d6 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -301,7 +301,6 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
const_tree type,
int misalignment,
bool is_packed);
-static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
aarch64_addr_query_type);
static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
@@ -20502,7 +20501,7 @@ aarch64_vq_mode (scalar_mode mode)
/* Return appropriate SIMD container
for MODE within a vector of WIDTH bits. */
-static machine_mode
+machine_mode
aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
{
if (TARGET_SVE
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/dup_opt.c b/gcc/testsuite/gcc.target/aarch64/sve/dup_opt.c
new file mode 100644
index 0000000000000000000000000000000000000000..66a1fcfb585b2c2b36a1344d4a33811257188dee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/dup_opt.c
@@ -0,0 +1,203 @@
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+#include <arm_sve.h>
+
+/*
+** float32_0:
+** ins v0.s\[1\], v1.s\[0\]
+** mov z0.d, d0
+** ret
+*/
+svfloat32_t float32_0(float x, float y)
+{
+ return svdupq_n_f32(x, y, x, y);
+}
+
+/*
+** float32_1:
+** mov z0.s, s0
+** ret
+*/
+
+svfloat32_t float32_1(float x)
+{
+ return svdupq_n_f32(x, x, x, x);
+}
+
+/*
+** float16_0:
+** ins v0.h\[1\], v1.h\[0\]
+** ins v0.h\[2\], v2.h\[0\]
+** ins v0.h\[3\], v3.h\[0\]
+** mov z0.d, d0
+** ret
+*/
+
+svfloat16_t float16_0 (float16_t a, float16_t b, float16_t c, float16_t d)
+{
+ return svdupq_n_f16 (a, b, c, d, a, b, c, d);
+}
+
+/*
+** float16_1:
+** dup v0.4h, v0.h\[0\]
+** ins v0.h\[1\], v1.h\[0\]
+** ins v0.h\[3\], v1.h\[0\]
+** mov z0.d, d0
+** ret
+*/
+
+svfloat16_t float16_1 (float16_t a, float16_t b)
+{
+ return svdupq_n_f16 (a, b, a, b, a, b, a, b);
+}
+
+/*
+** float16_2:
+** mov z0.h, h0
+** ret
+*/
+
+svfloat16_t float16_2 (float16_t a)
+{
+ return svdupq_n_f16 (a, a, a, a, a, a, a, a);
+}
+
+/*
+** int64_0:
+** mov z0.d, x0
+** ret
+*/
+
+svint64_t int64_0 (int64_t a)
+{
+ return svdupq_n_s64 (a, a);
+}
+
+/*
+** int32_0:
+** fmov d0, x0
+** ins v0.s\[1\], w1
+** mov z0.d, d0
+** ret
+*/
+
+svuint32_t int32_0(uint32_t a, uint32_t b) {
+ return svdupq_n_u32(a, b, a, b);
+}
+
+/*
+** int32_1:
+** mov z0.s, w0
+** ret
+*/
+
+svint32_t int32_1(int32_t a)
+{
+ return svdupq_n_s32(a, a, a, a);
+}
+
+/*
+** int16_0:
+** ...
+** fmov d0, x0
+** ins v0.h\[1\], w1
+** ins v0.h\[2\], w2
+** ins v0.h\[3\], w3
+** mov z0.d, d0
+** ret
+*/
+
+svint16_t int16_0(int16_t a, int16_t b, int16_t c, int16_t d)
+{
+ return svdupq_n_s16(a, b, c, d, a, b, c, d);
+}
+
+/*
+** int16_1:
+** dup v0.4h, w0
+** ins v0.h\[1\], w1
+** ins v0.h\[3\], w1
+** mov z0.d, d0
+** ret
+*/
+
+svuint16_t int16_1(uint16_t a, uint16_t b)
+{
+ return svdupq_n_u16(a, b, a, b, a, b, a, b);
+}
+
+/*
+** int16_2:
+** mov z0.h, w0
+** ret
+*/
+
+svint16_t int16_2(int16_t a)
+{
+ return svdupq_n_s16(a, a, a, a, a, a, a, a);
+}
+
+/*
+** int8_0:
+** ...
+** fmov d0, x0
+** ins v0.b\[1\], w1
+** ins v0.b\[2\], w2
+** ins v0.b\[3\], w3
+** ins v0.b\[4\], w4
+** ins v0.b\[5\], w5
+** ins v0.b\[6\], w6
+** ins v0.b\[7\], w7
+** mov z0.d, d0
+** ret
+*/
+
+svuint8_t int8_0(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e,
+                 uint8_t f, uint8_t g, uint8_t h)
+{
+ return svdupq_n_u8(a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h);
+}
+
+/*
+** int8_1:
+** dup v0.8b, w0
+** ins v0.b\[1\], w1
+** ins v0.b\[2\], w2
+** ins v0.b\[3\], w3
+** ins v0.b\[5\], w1
+** ins v0.b\[6\], w2
+** ins v0.b\[7\], w3
+** mov z0.d, d0
+** ret
+*/
+
+svint8_t int8_1(int8_t a, int8_t b, int8_t c, int8_t d)
+{
+ return svdupq_n_s8(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d);
+}
+
+/*
+** int8_2:
+** dup v0.8b, w0
+** ins v0.b\[1\], w1
+** ins v0.b\[3\], w1
+** ins v0.b\[5\], w1
+** ins v0.b\[7\], w1
+** mov z0.d, d0
+** ret
+*/
+
+svint8_t int8_2(int8_t a, int8_t b)
+{
+ return svdupq_n_s8(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b);
+}
+
+/*
+** int8_3:
+** mov z0.b, w0
+** ret
+*/
+
+svint8_t int8_3(int8_t a)
+{
+ return svdupq_n_s8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a);
+}