STP [PR113618]

Wilco Dijkstra Thu, 01 Feb 2024 05:46:11 -0800

The new RTL introduced for LDP/STP results in regressions due to use of UNSPEC.
Given the new LDP fusion pass is good at finding LDP opportunities, change the
memcpy, memmove and memset expansions to emit single vector loads/stores.
This fixes the regression and enables more RTL optimization on the standard
memory accesses.  SPEC2017 performance improves slightly.  Codesize is a bit
worse due to missed LDP opportunities as discussed in the PR.


Passes regress, OK for commit?

gcc/ChangeLog:
        PR target/113618
        * config/aarch64/aarch64.cc (aarch64_copy_one_block): Remove. 
        (aarch64_expand_cpymem): Emit single load/store only.
        (aarch64_set_one_block): Remove.
        (aarch64_expand_setmem): Emit single stores only.

gcc/testsuite/ChangeLog:
        PR target/113618
        * gcc.target/aarch64/pr113618.c: New test.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
d17198b4a5f73f8be8aeca3258b81809ffb48eac..2194441b949a53f181fe373e07bc18341c014918
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26376,33 +26376,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount)
                                    next, amount);
 }
 
-typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
-
-/* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
-static void
-aarch64_copy_one_block (copy_ops &ops, rtx src, rtx dst,
-                       int offset, machine_mode mode)
-{
-  /* Emit explict load/store pair instructions for 32-byte copies.  */
-  if (known_eq (GET_MODE_SIZE (mode), 32))
-    {
-      mode = V4SImode;
-      rtx src1 = adjust_address (src, mode, offset);
-      rtx dst1 = adjust_address (dst, mode, offset);
-      rtx reg1 = gen_reg_rtx (mode);
-      rtx reg2 = gen_reg_rtx (mode);
-      rtx load = aarch64_gen_load_pair (reg1, reg2, src1);
-      rtx store = aarch64_gen_store_pair (dst1, reg1, reg2);
-      ops.safe_push ({ load, store });
-      return;
-    }
-
-  rtx reg = gen_reg_rtx (mode);
-  rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
-  rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
-  ops.safe_push ({ load, store });
-}
-
 /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
    from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
    rather than memcpy.  Return true iff we succeeded.  */
@@ -26438,7 +26411,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   rtx src = operands[1];
   unsigned align = UINTVAL (operands[3]);
   rtx base;
-  machine_mode cur_mode = BLKmode, next_mode;
+  machine_mode mode = BLKmode, next_mode;
 
   /* Variable-sized or strict-align copies may use the MOPS expansion.  */
   if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
@@ -26465,7 +26438,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
      ??? Although it would be possible to use LDP/STP Qn in streaming mode
      (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
      whether that would improve performance.  */
-  unsigned copy_max = (size <= 24 || !TARGET_SIMD) ? 16 : 32;
+  bool use_qregs = size > 24 && TARGET_SIMD;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
@@ -26473,7 +26446,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
-  copy_ops ops;
+  auto_vec<std::pair<rtx, rtx>, 16> ops;
   int offset = 0;
 
   while (size > 0)
@@ -26482,23 +26455,27 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
         or writing.  */
       opt_scalar_int_mode mode_iter;
       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, copy_max))
-         cur_mode = mode_iter.require ();
+       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, 16))
+         mode = mode_iter.require ();
+
+      gcc_assert (mode != BLKmode);
 
-      gcc_assert (cur_mode != BLKmode);
+      mode_bytes = GET_MODE_SIZE (mode).to_constant ();
 
-      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
+      /* Prefer Q-register accesses.  */
+      if (mode_bytes == 16 && use_qregs)
+       mode = V4SImode;
 
-      /* Prefer Q-register accesses for the last bytes.  */
-      if (mode_bytes == 16 && copy_max == 32)
-       cur_mode = V4SImode;
-      aarch64_copy_one_block (ops, src, dst, offset, cur_mode);
+      rtx reg = gen_reg_rtx (mode);
+      rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
+      rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
+      ops.safe_push ({ load, store });
       size -= mode_bytes;
       offset += mode_bytes;
 
       /* Emit trailing copies using overlapping unaligned accesses
         (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
-      if (size > 0 && size < copy_max / 2 && !STRICT_ALIGNMENT)
+      if (size > 0 && size < 16 && !STRICT_ALIGNMENT)
        {
          next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
          int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
@@ -26510,7 +26487,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
 
   /* Memcpy interleaves loads with stores, memmove emits all loads first.  */
   int nops = ops.length();
-  int inc = is_memmove ? nops : nops == 4 ? 2 : 3;
+  int inc = is_memmove || nops <= 8 ? nops : 6;
 
   for (int i = 0; i < nops; i += inc)
     {
@@ -26525,23 +26502,6 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   return true;
 }
 
-/* Set one block of size MODE at DST at offset OFFSET to value in SRC.  */
-static void
-aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
-{
-  /* Emit explict store pair instructions for 32-byte writes.  */
-  if (known_eq (GET_MODE_SIZE (mode), 32))
-    {
-      mode = V16QImode;
-      rtx dst1 = adjust_address (dst, mode, offset);
-      emit_insn (aarch64_gen_store_pair (dst1, src, src));
-      return;
-    }
-  if (known_lt (GET_MODE_SIZE (mode), 16))
-    src = lowpart_subreg (mode, src, GET_MODE (src));
-  emit_move_insn (adjust_address (dst, mode, offset), src);
-}
-
 /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
    as for the setmem pattern.  Return true iff we succeed.  */
 static bool
@@ -26574,7 +26534,7 @@ aarch64_expand_setmem (rtx *operands)
   rtx val = operands[2], src;
   unsigned align = UINTVAL (operands[3]);
   rtx base;
-  machine_mode cur_mode = BLKmode, next_mode;
+  machine_mode mode = BLKmode, next_mode;
 
   /* Variable-sized or strict-align memset may use the MOPS expansion.  */
   if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
@@ -26595,11 +26555,8 @@ aarch64_expand_setmem (rtx *operands)
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
 
   /* Prepare the val using a DUP/MOVI v0.16B, val.  */
-  src = expand_vector_broadcast (V16QImode, val);
-  src = force_reg (V16QImode, src);
-
-  /* Set maximum number of bytes to write per instruction.  */
-  unsigned set_max = (len <= 24) ? 16 : 32;
+  val = expand_vector_broadcast (V16QImode, val);
+  val = force_reg (V16QImode, val);
 
   int offset = 0;
   while (len > 0)
@@ -26608,24 +26565,28 @@ aarch64_expand_setmem (rtx *operands)
         over writing.  */
       opt_scalar_int_mode mode_iter;
       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, set_max))
-         cur_mode = mode_iter.require ();
+       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, 16))
+         mode = mode_iter.require ();
+
+      gcc_assert (mode != BLKmode);
 
-      gcc_assert (cur_mode != BLKmode);
+      mode_bytes = GET_MODE_SIZE (mode).to_constant ();
 
-      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
+      src = val;
 
-      /* Prefer Q-register accesses for the last bytes.  */
+      /* Prefer Q-register accesses.  */
       if (mode_bytes == 16)
-       cur_mode = V16QImode;
+       mode = V16QImode;
+      else
+       src = lowpart_subreg (mode, src, GET_MODE (val));
 
-      aarch64_set_one_block (src, dst, offset, cur_mode);
+      emit_move_insn (adjust_address (dst, mode, offset), src);
       len -= mode_bytes;
       offset += mode_bytes;
 
       /* Emit trailing writes using overlapping unaligned accesses
         (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
-      if (len > 0 && len < set_max / 2 && !STRICT_ALIGNMENT)
+      if (len > 0 && len < 16 && !STRICT_ALIGNMENT)
        {
          next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT);
          int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
diff --git a/gcc/testsuite/gcc.target/aarch64/pr113618.c 
b/gcc/testsuite/gcc.target/aarch64/pr113618.c
new file mode 100644
index 
0000000000000000000000000000000000000000..f582360e7c1d149ad94c78c3e66f1c4a973750bb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr113618.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+
+/*
+** move32:
+**     ...
+**     ldp     q([0-9]+), q([0-9]+), \[x0\]
+**     stp     q\1, q\2, \[x1\]
+**     ...
+*/
+
+void move32 (char *a, char *b)
+{
+    char temp[32];
+    __builtin_memcpy (temp, a, 32);
+    __builtin_memcpy (b, temp, 32);
+}
+
+/*
+** move64:
+**     ...
+**     ldp     q([0-9]+), q([0-9]+), \[x0\]
+**     ldp     q([0-9]+), q([0-9]+), \[x0, 32\]
+**     stp     q\1, q\2, \[x1\]
+**     stp     q\3, q\4, \[x1, 32\]
+**     ...
+*/
+
+void move64 (char *a, char *b)
+{
+    char temp[64];
+    __builtin_memcpy (temp, a, 64);
+    __builtin_memcpy (b, temp, 64);
+}

[PATCH] AArch64: memcpy/memset expansions should not emit LDP/STP [PR113618]

Reply via email to