Hi,

This is my attempt at reviving the old patch
https://gcc.gnu.org/pipermail/gcc-patches/2019-January/514632.html
I have followed up on Kyrill's comment upstream (at the link above) and I am
using the recommended option iii that he mentioned:

"1) Adjust the copy_limit to 256 bits after checking
    AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS in the tuning.
 2) Adjust aarch64_copy_one_block_and_progress_pointers to handle 256-bit
    moves, by iii:
    iii) Emit explicit V4SI (or any other 128-bit vector mode) pairs
    ldp/stps.  This wouldn't need any adjustments to MD patterns, but would
    make aarch64_copy_one_block_and_progress_pointers more complex as it
    would now have two paths, where one handles two adjacent memory
    addresses in one call."

With this patch, the following test

#define N 8
extern int src[N], dst[N];

void
foo (void)
{
  __builtin_memcpy (dst, src, N * sizeof (int));
}

which originally generated

foo:
        adrp    x1, src
        add     x1, x1, :lo12:src
        ldp     x4, x5, [x1]
        adrp    x0, dst
        add     x0, x0, :lo12:dst
        ldp     x2, x3, [x1, 16]
        stp     x4, x5, [x0]
        stp     x2, x3, [x0, 16]
        ret

now generates

foo:
        adrp    x1, src
        add     x1, x1, :lo12:src
        adrp    x0, dst
        add     x0, x0, :lo12:dst
        ldp     q1, q0, [x1]
        stp     q1, q0, [x0]
        ret
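The sizes in this test line up exactly with one Q-register pair:

  8 ints * 4 bytes = 32 bytes = 256 bits = 2 * 128-bit Q registers

so the whole copy collapses from two ldp/stp pairs of X registers into a
single ldp q / stp q pair.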
This gives about a 1.3% improvement on 523.xalancbmk_r in SPEC2017 and an
overall code size reduction on most SPEC2017 Int benchmarks on Neoverse N1,
due to more use of Q-register LDP/STP pairs.

Bootstrapped and regression tested on aarch64-none-linux-gnu.

Is this ok for trunk?

Thanks
Sudi

gcc/ChangeLog:

2020-07-23  Sudakshina Das  <sudi....@arm.com>
	    Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

	* config/aarch64/aarch64.c (aarch64_gen_store_pair): Add case
	for E_V4SImode.
	(aarch64_gen_load_pair): Likewise.
	(aarch64_copy_one_block_and_progress_pointers): Handle 256-bit
	copy.
	(aarch64_expand_cpymem): Expand copy_limit to 256 bits where
	appropriate.

gcc/testsuite/ChangeLog:

2020-07-23  Sudakshina Das  <sudi....@arm.com>
	    Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

	* gcc.target/aarch64/cpymem-q-reg_1.c: New test.
	* gcc.target/aarch64/large_struct_copy_2.c: Update for ldp q regs.

****************************** Attachment inlined ******************************

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 3fe1feaa80ccb0a287ee1c7ea1056e8f0a830532..a38ff39c4d5d53f056bbba3114ebaf8f0414c037 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -6920,6 +6920,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
     case E_TFmode:
       return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
 
+    case E_V4SImode:
+      return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
+
     default:
       gcc_unreachable ();
     }
@@ -6943,6 +6946,9 @@ aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
     case E_TFmode:
       return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
 
+    case E_V4SImode:
+      return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
+
     default:
       gcc_unreachable ();
     }
@@ -21097,6 +21103,27 @@ static void
 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
 					      machine_mode mode)
 {
+  /* Handle 256-bit memcpy separately.  We do this by making 2 adjacent
+     memory address copies using V4SImode so that we can use Q registers.  */
+  if (known_eq (GET_MODE_BITSIZE (mode), 256))
+    {
+      mode = V4SImode;
+      rtx reg1 = gen_reg_rtx (mode);
+      rtx reg2 = gen_reg_rtx (mode);
+      /* "Cast" the pointers to the correct mode.  */
+      *src = adjust_address (*src, mode, 0);
+      *dst = adjust_address (*dst, mode, 0);
+      /* Emit the memcpy.  */
+      emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
+					aarch64_progress_pointer (*src)));
+      emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
+					 aarch64_progress_pointer (*dst), reg2));
+      /* Move the pointers forward.  */
+      *src = aarch64_move_pointer (*src, 32);
+      *dst = aarch64_move_pointer (*dst, 32);
+      return;
+    }
+
   rtx reg = gen_reg_rtx (mode);
 
   /* "Cast" the pointers to the correct mode.  */
@@ -21150,9 +21177,12 @@ aarch64_expand_cpymem (rtx *operands)
   /* Convert n to bits to make the rest of the code simpler.  */
   n = n * BITS_PER_UNIT;
 
-  /* Maximum amount to copy in one go.  The AArch64 back-end has integer modes
-     larger than TImode, but we should not use them for loads/stores here.  */
-  const int copy_limit = GET_MODE_BITSIZE (TImode);
+  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
+     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter and TARGET_SIMD.  */
+  const int copy_limit = ((aarch64_tune_params.extra_tuning_flags
+			   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
+			  || !TARGET_SIMD)
+			 ? GET_MODE_BITSIZE (TImode) : 256;
 
   while (n > 0)
     {
diff --git a/gcc/testsuite/gcc.target/aarch64/cpymem-q-reg_1.c b/gcc/testsuite/gcc.target/aarch64/cpymem-q-reg_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..df5f67e425bd6a52362bc912afbcd1ca3725d449
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/cpymem-q-reg_1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define N 8
+extern int src[N], dst[N];
+
+void
+foo (void)
+{
+  __builtin_memcpy (dst, src, N * sizeof (int));
+}
+
+/* { dg-final { scan-assembler {ldp\tq[0-9]*} } } */
+/* { dg-final { scan-assembler-not {ldp\tx[0-9]*} } } */
+/* { dg-final { scan-assembler {stp\tq[0-9]*} } } */
+/* { dg-final { scan-assembler-not {stp\tx[0-9]*} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c b/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c
index 565434244e8296749a99bebc4e095065945d825e..8ee0a9fae92f805573b763ec679f2a8045afb479 100644
--- a/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/large_struct_copy_2.c
@@ -16,11 +16,10 @@ struct S2 {
   struct S0 f3;
 };
 
-void fn1 () {
-  struct S2 b = {0, 1, 7, 4073709551611, 4, 8, 7};
+void fn1 (struct S2 b) {
   a = b.f3;
 }
 
-/* { dg-final { scan-assembler-times {ldp\s+x[0-9]+} 2 } } */
-/* { dg-final { scan-assembler-times {stp\s+x[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {ldp\s+q[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {stp\s+q[0-9]+} 1 } } */
 /* { dg-final { scan-assembler-not {ld[1-3]} } } */
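For illustration only (a hypothetical standalone sketch, not part of the
patch, which ignores the back-end's overlapping-copy handling of trailing
bytes), the chunk-selection arithmetic with the raised copy_limit works out
roughly like this:

/* Simplified model of the chunk splitting in aarch64_expand_cpymem:
   repeatedly pick the largest power-of-two chunk that fits both the
   remaining size and copy_limit.  */
#include <stdio.h>

int
main (void)
{
  int n = 48 * 8;		/* A 48-byte copy, in bits.  */
  const int copy_limit = 256;	/* Q-register pairs allowed.  */

  while (n > 0)
    {
      int chunk = copy_limit;
      while (chunk > n)
	chunk /= 2;
      /* 256 bits maps to an ldp/stp of Q registers, 128 bits to a
	 single TImode move, and so on down.  */
      printf ("copy %d bits\n", chunk);
      n -= chunk;
    }
  return 0;
}

For the 48-byte case this prints "copy 256 bits" then "copy 128 bits", i.e.
one Q-register pair followed by one TImode chunk.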