On Tue, 15 Aug 2023 at 15:06, Jeff Law <jeffreya...@gmail.com> wrote:
 >
> On 8/15/23 03:16, juzhe.zh...@rivai.ai wrote:
> > The new  patch looks reasonable to me now. Thanks for fixing it.
> >
> > Could you append testcase after finishing test infrastructure ?
> > I prefer this patch with testcase after infrastructure.
> So let's call this an ACK, but ask that Joern not commit until the
> testsuite bits are in place.

Beyond adding the tests, the patch needed some changes because of the
refactoring of the emit_{vlmax,nonvlmax}_xxx functions.
Attached is the committed version.
commit 9464e72bcc9123b619215af8cfef491772a3ebd9
Author: Joern Rennecke <joern.renne...@embecosm.com>
Date:   Mon Oct 2 03:16:09 2023 +0100

    cpymem for RISC-V with v extension
    
    gcc/
            * config/riscv/riscv-protos.h (riscv_vector::expand_block_move):
            Declare.
            * config/riscv/riscv-v.cc (riscv_vector::expand_block_move):
            New function.
            * config/riscv/riscv.md (cpymemsi): Use riscv_vector::expand_block_move.
            Change to ..
            (cpymem<P:mode>) .. this.
    
    gcc/testsuite/
            * gcc.target/riscv/rvv/base/cpymem-1.c: New test.
            * gcc.target/riscv/rvv/base/cpymem-2.c: Likewise.
    
    Co-Authored-By: Juzhe-Zhong <juzhe.zh...@rivai.ai>

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index af5baf37e6a..43426a5326b 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -492,6 +492,7 @@ bool slide1_sew64_helper (int, machine_mode, machine_mode,
                          machine_mode, rtx *);
 rtx gen_avl_for_scalar_move (rtx);
 void expand_tuple_move (rtx *);
+bool expand_block_move (rtx, rtx, rtx);
 machine_mode preferred_simd_mode (scalar_mode);
 machine_mode get_mask_mode (machine_mode);
 void expand_vec_series (rtx, rtx, rtx);
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 097457562bd..29e138e1da2 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -49,6 +49,7 @@
 #include "tm-constrs.h"
 #include "rtx-vector-builder.h"
 #include "targhooks.h"
+#include "predict.h"
 
 using namespace riscv_vector;
 
@@ -1991,6 +1992,206 @@ expand_tuple_move (rtx *ops)
     }
 }
 
+/* Used by cpymem<P:mode> in riscv.md.  */
+
+bool
+expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
+{
+  /*
+    memcpy:
+       mv a3, a0                       # Copy destination
+    loop:
+       vsetvli t0, a2, e8, m8, ta, ma  # Vectors of 8b
+       vle8.v v0, (a1)                 # Load bytes
+       add a1, a1, t0                  # Bump pointer
+       sub a2, a2, t0                  # Decrement count
+       vse8.v v0, (a3)                 # Store bytes
+       add a3, a3, t0                  # Bump pointer
+       bnez a2, loop                   # Any more?
+       ret                             # Return
+  */
+  if (!TARGET_VECTOR)
+    return false;
+  HOST_WIDE_INT potential_ew
+    = (MIN (MIN (MEM_ALIGN (src_in), MEM_ALIGN (dst_in)), BITS_PER_WORD)
+       / BITS_PER_UNIT);
+  machine_mode vmode = VOIDmode;
+  bool need_loop = true;
+  bool size_p = optimize_function_for_size_p (cfun);
+  rtx src, dst;
+  rtx end = gen_reg_rtx (Pmode);
+  rtx vec;
+  rtx length_rtx = length_in;
+
+  if (CONST_INT_P (length_in))
+    {
+      HOST_WIDE_INT length = INTVAL (length_in);
+
+    /* By using LMUL=8, we can copy as many bytes in one go as there
+       are bits in a vector register.  If the entire block thus fits,
+       we don't need a loop.  */
+    if (length <= TARGET_MIN_VLEN)
+      {
+       need_loop = false;
+
+       /* If a single scalar load / store pair can do the job, leave it
+          to the scalar code to do that.  */
+       /* ??? If fast unaligned access is supported, the scalar code could
+          use suitably sized scalars irrespective of alignment.  If that
+          gets fixed, we have to adjust the test here.  */
+
+       if (pow2p_hwi (length) && length <= potential_ew)
+         return false;
+      }
+
+      /* Find the vector mode to use.  Using the largest possible element
+        size is likely to give smaller constants, thus potentially
+        reducing code size.  However, if we need a loop, we need to update
+        the pointers, and that is more complicated with a larger element
+        size, unless we use an immediate, which prevents us from dynamically
+        using the target's transfer size that the hart supports.  And then,
+        unless we know the *exact* vector size of the hart, we'd need
+        multiple vsetvli / branch statements, so it's not even a size win.
+        If, in the future, we find an RVV implementation that is slower
+        for small element widths, we might allow larger element widths for
+        loops too.  */
+      if (need_loop)
+       potential_ew = 1;
+      for (; potential_ew; potential_ew >>= 1)
+       {
+         scalar_int_mode elem_mode;
+         unsigned HOST_WIDE_INT bits = potential_ew * BITS_PER_UNIT;
+         unsigned HOST_WIDE_INT per_iter;
+         HOST_WIDE_INT nunits;
+
+         if (need_loop)
+           per_iter = TARGET_MIN_VLEN;
+         else
+           per_iter = length;
+         nunits = per_iter / potential_ew;
+
+         /* Unless we get an implementation that's slow for small element
+            size / non-word-aligned accesses, we assume that the hardware
+            handles this well, and we don't want to complicate the code
+            with shifting word contents around or handling extra bytes at
+            the start and/or end.  So we want the total transfer size and
+            alignment to fit with the element size.  */
+         if (length % potential_ew != 0
+             || !int_mode_for_size (bits, 0).exists (&elem_mode))
+           continue;
+         /* Find the mode to use for the copy inside the loop - or the
+            sole copy, if there is no loop.  */
+         if (!need_loop)
+           {
+             /* Try if we have an exact mode for the copy.  */
+             if (get_vector_mode (elem_mode, nunits).exists (&vmode))
+               break;
+             /* Since we don't have a mode that exactly matches the transfer
+                size, we'll need to use pred_store, which is not available
+                for all vector modes, but only iE_RVV_M* modes, hence trying
+                to find a vector mode for a merely rounded-up size is
+                pointless.
+                Still, by choosing a lower LMUL factor that still allows
+                an entire transfer, we can reduce register pressure.  */
+             for (unsigned lmul = 1; lmul <= 4; lmul <<= 1)
+               if (TARGET_MIN_VLEN * lmul <= nunits * BITS_PER_UNIT
+                   /* Avoid losing the option of using vsetivli.  */
+                   && (nunits <= 31 * lmul || nunits > 31 * 8)
+                   && (get_vector_mode
+                        (elem_mode,
+                         exact_div (BYTES_PER_RISCV_VECTOR * lmul,
+                                    potential_ew)
+                         ).exists (&vmode)))
+                 break;
+           }
+
+         /* The RVVM8?I modes are notionally 8 * BYTES_PER_RISCV_VECTOR bytes
+            wide.  BYTES_PER_RISCV_VECTOR can't be evenly divided by
+            the sizes of larger element types; the LMUL factor of 8 can at
+            the moment be divided by the SEW, with SEW of up to 8 bytes,
+            but there are reserved encodings so there might be larger
+            SEW in the future.  */
+         if (get_vector_mode (elem_mode,
+                              exact_div (BYTES_PER_RISCV_VECTOR * 8,
+                                         potential_ew)).exists (&vmode))
+           break;
+
+         /* We may get here if we tried an element size that's larger than
+            the hardware supports, but we should at least find a suitable
+            byte vector mode.  */
+         gcc_assert (potential_ew > 1);
+       }
+      if (potential_ew > 1)
+       length_rtx = GEN_INT (length / potential_ew);
+    }
+  else
+    {
+      vmode = E_RVVM8QImode;
+    }
+
+  /* A memcpy libcall in the worst case takes 3 instructions to prepare the
+     arguments + 1 for the call, whereas the RVV loop takes 7 instructions,
+     so when we're optimizing for size a libcall may be preferable.  */
+  if (size_p && need_loop)
+    return false;
+
+  /* length_rtx holds the (remaining) length of the required copy.
+     cnt holds the length we copy with the current load/store pair.  */
+  rtx cnt = length_rtx;
+  rtx label = NULL_RTX;
+  rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0));
+  rtx src_addr = copy_addr_to_reg (XEXP (src_in, 0));
+
+  if (need_loop)
+    {
+      length_rtx = copy_to_mode_reg (Pmode, length_rtx);
+      cnt = gen_reg_rtx (Pmode);
+      label = gen_label_rtx ();
+
+      emit_label (label);
+      emit_insn (gen_no_side_effects_vsetvl_rtx (vmode, cnt, length_rtx));
+    }
+
+  vec = gen_reg_rtx (vmode);
+  src = change_address (src_in, vmode, src_addr);
+  dst = change_address (dst_in, vmode, dst_addr);
+
+  /* If we don't need a loop and have a suitable mode to describe the size,
+     just do a load / store pair and leave it up to the later lazy code
+     motion pass to insert the appropriate vsetvli.  */
+  if (!need_loop && known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
+    {
+      emit_move_insn (vec, src);
+      emit_move_insn (dst, vec);
+    }
+  else
+    {
+      machine_mode mask_mode = get_vector_mode (BImode, GET_MODE_NUNITS (vmode)).require ();
+      rtx mask = CONSTM1_RTX (mask_mode);
+      if (!satisfies_constraint_K (cnt))
+       cnt = force_reg (Pmode, cnt);
+      rtx m_ops[] = {vec, mask, src};
+      emit_nonvlmax_insn (code_for_pred_mov (vmode), UNARY_OP_TAMA,
+                         m_ops, cnt);
+      emit_insn (gen_pred_store (vmode, dst, mask, vec, cnt,
+                                get_avl_type_rtx (NONVLMAX)));
+    }
+
+  if (need_loop)
+    {
+      emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, cnt)));
+      emit_insn (gen_rtx_SET (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, cnt)));
+      emit_insn (gen_rtx_SET (length_rtx, gen_rtx_MINUS (Pmode, length_rtx, cnt)));
+
+      /* Emit the loop condition.  */
+      rtx test = gen_rtx_NE (VOIDmode, end, const0_rtx);
+      emit_jump_insn (gen_cbranch4 (Pmode, test, length_rtx, const0_rtx, label));
+      emit_insn (gen_nop ());
+    }
+
+  return true;
+}
+
 /* Return the vectorization machine mode for RVV according to LMUL.  */
 machine_mode
 preferred_simd_mode (scalar_mode mode)
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index e00b8ee3579..1ebe8f92284 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -2271,14 +2271,16 @@
   DONE;
 })
 
-(define_expand "cpymemsi"
+(define_expand "cpymem<mode>"
   [(parallel [(set (match_operand:BLK 0 "general_operand")
                   (match_operand:BLK 1 "general_operand"))
-             (use (match_operand:SI 2 ""))
+             (use (match_operand:P 2 ""))
              (use (match_operand:SI 3 "const_int_operand"))])]
   ""
 {
-  if (riscv_expand_block_move (operands[0], operands[1], operands[2]))
+  if (riscv_vector::expand_block_move (operands[0], operands[1], operands[2]))
+    DONE;
+  else if (riscv_expand_block_move (operands[0], operands[1], operands[2]))
     DONE;
   else
     FAIL;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-1.c
new file mode 100644
index 00000000000..9bb4904e8e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-1.c
@@ -0,0 +1,71 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+/* { dg-add-options riscv_v } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#if 0 /* Using include files when using a multilib-relevant -march option is dicey */
+#include <string.h>
+#else
+extern void *memcpy(void *__restrict dest, const void *__restrict src, __SIZE_TYPE__ n);
+#endif
+
+/* memcpy should be implemented using the cpymem pattern.
+** f1:
+XX     \.L\d+: # local label is ignored
+**     vsetvli\s+[ta][0-7],a2,e8,m8,ta,ma
+**     vle8\.v\s+v\d+,0\(a1\)
+**     vse8\.v\s+v\d+,0\(a0\)
+**     add\s+a1,a1,[ta][0-7]
+**     add\s+a0,a0,[ta][0-7]
+**     sub\s+a2,a2,[ta][0-7]
+**     bne\s+a2,zero,\.L\d+
+**     ret
+*/
+
+void f1 (void *a, void *b, __SIZE_TYPE__ l)
+{
+  memcpy (a, b, l);
+}
+
+/* We should still use cpymem even with slightly different types, as signed
+   overflow is undefined.
+** f2:
+XX     \.L\d+: # local label is ignored
+**     vsetvli\s+[ta][0-7],a2,e8,m8,ta,ma
+**     vle8\.v\s+v\d+,0\(a1\)
+**     vse8\.v\s+v\d+,0\(a0\)
+**     add\s+a1,a1,[ta][0-7]
+**     add\s+a0,a0,[ta][0-7]
+**     sub\s+a2,a2,[ta][0-7]
+**     bne\s+a2,zero,\.L\d+
+**     ret
+*/
+void f2 (__INT32_TYPE__* a, __INT32_TYPE__* b, int l)
+{
+  memcpy (a, b, l);
+}
+
+/* If it is known that the pointer arguments to memcpy point
+   to an aligned object, cpymem can use that alignment.
+   Use extern here so that we get a known alignment, lest
+   DATA_ALIGNMENT force us to make the scan pattern accommodate
+   code for different alignments depending on word size.
+** f3:
+**        lui\s+[ta][0-7],%hi\(a_a\)
+**        lui\s+[ta][0-7],%hi\(a_b\)
+**        addi\s+a4,[ta][0-7],%lo\(a_b\)
+**        vsetivli\s+zero,16,e32,m4,ta,ma
+**        vle32.v\s+v\d+,0\([ta][0-7]\)
+**        addi\s+[ta][0-7],[ta][0-7],%lo\(a_a\)
+**        vse32\.v\s+v\d+,0\([ta][0-7]\)
+**        ret
+*/
+
+extern struct { __INT32_TYPE__ a[16]; } a_a, a_b;
+
+void f3 ()
+{
+  memcpy (&a_a, &a_b, sizeof a_a);
+}
+
+/* { dg-final { scan-assembler-not {\m(tail|call)\s+memcpy\M} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-2.c b/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-2.c
new file mode 100644
index 00000000000..7b706b6ef52
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-2.c
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+/* { dg-add-options riscv_v } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef struct { char c[16]; } c16;
+typedef struct { char c[32]; } c32;
+typedef struct { short s; char c[30]; } s16;
+
+/* A short struct copy can use vsetivli.
+** f1:
+**     vsetivli\s+zero,16,e8,m1,ta,ma
+**     vle8.v\s+v1,0\(a1\)
+**     vse8.v\s+v1,0\(a0\)
+**     ret
+*/
+void f1 (c16 *a, c16* b)
+{
+  *a = *b;
+}
+
+/* A longer one needs li.
+** f2:
+**     li\s+[ta][0-7],32
+**     vsetvli\s+zero,[ta][0-7],e8,m2,ta,ma
+**     vle8.v\s+v2,0\(a1\)
+**     vse8.v\s+v2,0\(a0\)
+**     ret
+*/
+void f2 (c32 *a, c32* b)
+{
+  *a = *b;
+}
+
+/* A 32 byte struct is still short enough for vsetivli
+   if we can use an element width larger than 8.
+** f3:
+**     vsetivli\s+zero,16,e16,m2,ta,ma
+**     vle16.v\s+v2,0\(a1\)
+**     vse16.v\s+v2,0\(a0\)
+**     ret
+*/
+void f3 (s16 *a, s16* b)
+{
+  *a = *b;
+}
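
For readers skimming the patch, the loop that expand_block_move emits for a
variable length corresponds roughly to the intrinsics-level sketch below.  It
only illustrates the vsetvli / vle8.v / vse8.v sequence shown in the comment
at the top of the function; the expansion itself works on RTL, and the sketch
assumes the standard __riscv_-prefixed RVV intrinsics from riscv_vector.h,
which are not part of this patch.

#include <riscv_vector.h>
#include <stdint.h>
#include <stddef.h>

/* C-level approximation of the emitted copy loop (e8, LMUL=8).  */
void
copy_loop (uint8_t *dst, const uint8_t *src, size_t n)
{
  while (n > 0)
    {
      size_t vl = __riscv_vsetvl_e8m8 (n);           /* vsetvli t0,a2,e8,m8,ta,ma */
      vuint8m8_t v = __riscv_vle8_v_u8m8 (src, vl);  /* vle8.v v0,(a1) */
      __riscv_vse8_v_u8m8 (dst, v, vl);              /* vse8.v v0,(a3) */
      src += vl;                                     /* bump pointers */
      dst += vl;
      n -= vl;                                       /* decrement count */
    }
}

To see what the expansion actually generates, compiling the test files above
with something like -O2 -march=rv64gcv -mabi=lp64d -S (the exact cross
compiler and -march string depend on your setup) should show the same
vsetvli loop, or a single vsetivli/vle/vse pair for short constant lengths.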
