It seems that you haven't committed it yet. A nit comment:

+  int lmul = riscv_autovec_lmul == RVV_DYNAMIC ? RVV_M8 : riscv_autovec_lmul;

I think you could use TARGET_MAX_LMUL here:
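A minimal sketch of the suggested simplification, assuming TARGET_MAX_LMUL folds the RVV_DYNAMIC case to RVV_M8 as in the riscv-opts.h change linked below:

  /* TARGET_MAX_LMUL already handles riscv_autovec_lmul == RVV_DYNAMIC,
     so expand_rawmemchr could simply do:  */
  int lmul = TARGET_MAX_LMUL;
  poly_int64 nunits = exact_div (BYTES_PER_RISCV_VECTOR * lmul, isize);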
https://gcc.gnu.org/git/?p=gcc.git;a=blobdiff;f=gcc/config/riscv/riscv-opts.h;h=532b1b6b84a0fff7d4755507cef32193276c0c3f;hp=e557f70f414b21fb1ac7535504653f896191238b;hb=446efa52a8cadb56d1d994da5c4de394efaff462;hpb=e37bc2cf00671e3bc4d82f2627330c0f885a6f29 juzhe.zh...@rivai.ai From: Robin Dapp Date: 2023-10-27 16:50 To: juzhe.zh...@rivai.ai; kito.cheng CC: rdapp.gcc; gcc-patches; palmer; jeffreyalaw Subject: Re: [PATCH] RISC-V: Add rawmemchr expander. Attached v3 that I'd commit. Regards Robin From 246b986a8ea2332ced7a094dd68d35d84dcbbc04 Mon Sep 17 00:00:00 2001 From: Robin Dapp <rd...@ventanamicro.com> Date: Tue, 24 Oct 2023 10:33:15 +0200 Subject: [PATCH v3] RISC-V: Add rawmemchr expander. This patch adds a vectorized rawmemchr expander. It also moves the vectorized expand_block_move to riscv-string.cc. gcc/ChangeLog: * config/riscv/autovec.md (rawmemchr<ANYI:mode>): New expander. * config/riscv/riscv-protos.h (gen_no_side_effects_vsetvl_rtx): Define. (expand_rawmemchr): Define. * config/riscv/riscv-v.cc (force_vector_length_operand): Remove static. (expand_block_move): Move from here... * config/riscv/riscv-string.cc (expand_block_move): ...to here. (expand_rawmemchr): Add vectorized expander. * internal-fn.cc (expand_RAWMEMCHR): Fix typo. gcc/testsuite/ChangeLog: * gcc.dg/tree-prof/peel-2.c: Add -fno-tree-loop-distribute-patterns. * gcc.dg/tree-ssa/ldist-rawmemchr-1.c: Add riscv. * gcc.dg/tree-ssa/ldist-rawmemchr-2.c: Ditto. * gcc.target/riscv/rvv/rvv.exp: Add builtin directory. * gcc.target/riscv/rvv/autovec/builtin/rawmemchr-1.c: New test. --- gcc/config/riscv/autovec.md | 13 + gcc/config/riscv/riscv-protos.h | 2 + gcc/config/riscv/riscv-string.cc | 302 ++++++++++++++++++ gcc/config/riscv/riscv-v.cc | 202 +----------- gcc/internal-fn.cc | 2 +- gcc/testsuite/gcc.dg/tree-prof/peel-2.c | 2 +- .../gcc.dg/tree-ssa/ldist-rawmemchr-1.c | 8 +- .../gcc.dg/tree-ssa/ldist-rawmemchr-2.c | 8 +- .../riscv/rvv/autovec/builtin/rawmemchr-1.c | 99 ++++++ gcc/testsuite/gcc.target/riscv/rvv/rvv.exp | 2 + 10 files changed, 429 insertions(+), 211 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/builtin/rawmemchr-1.c diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md index 1ddc1993120..4f13494afdb 100644 --- a/gcc/config/riscv/autovec.md +++ b/gcc/config/riscv/autovec.md @@ -2397,3 +2397,16 @@ (define_expand "lfloor<mode><v_i_l_ll_convert>2" DONE; } ) + +;; Implement rawmemchr[qi|si|hi]. +(define_expand "rawmemchr<ANYI:mode>" + [(match_operand 0 "register_operand") + (match_operand 1 "memory_operand") + (match_operand:ANYI 2 "const_int_operand")] + "TARGET_VECTOR" + { + riscv_vector::expand_rawmemchr(<MODE>mode, operands[0], operands[1], + operands[2]); + DONE; + } +) diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 843a81b0e86..44189ec8139 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -495,6 +495,7 @@ void expand_vec_lfloor (rtx, rtx, machine_mode, machine_mode); bool sew64_scalar_helper (rtx *, rtx *, rtx, machine_mode, bool, void (*)(rtx *, rtx)); rtx gen_scalar_move_mask (machine_mode); +rtx gen_no_side_effects_vsetvl_rtx (machine_mode, rtx, rtx); /* RVV vector register sizes. 
TODO: Currently, we only add RVV_32/RVV_64/RVV_128, we may need to @@ -526,6 +527,7 @@ void expand_cond_unop (unsigned, rtx *); void expand_cond_binop (unsigned, rtx *); void expand_cond_ternop (unsigned, rtx *); void expand_popcount (rtx *); +void expand_rawmemchr (machine_mode, rtx, rtx, rtx); /* Rounding mode bitfield for fixed point VXRM. */ enum fixed_point_rounding_mode diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc index 0b4606aa7b2..75c2acaf9b1 100644 --- a/gcc/config/riscv/riscv-string.cc +++ b/gcc/config/riscv/riscv-string.cc @@ -36,6 +36,9 @@ #include "target.h" #include "predict.h" #include "optabs.h" +#include "riscv-protos.h" +#include "recog.h" +#include "tm-constrs.h" /* Emit proper instruction depending on mode of dest. */ @@ -747,3 +750,302 @@ riscv_expand_block_move (rtx dest, rtx src, rtx length) } return false; } + +/* --- Vector expanders --- */ + +namespace riscv_vector { + +/* Used by cpymemsi in riscv.md . */ + +bool +expand_block_move (rtx dst_in, rtx src_in, rtx length_in) +{ + /* + memcpy: + mv a3, a0 # Copy destination + loop: + vsetvli t0, a2, e8, m8, ta, ma # Vectors of 8b + vle8.v v0, (a1) # Load bytes + add a1, a1, t0 # Bump pointer + sub a2, a2, t0 # Decrement count + vse8.v v0, (a3) # Store bytes + add a3, a3, t0 # Bump pointer + bnez a2, loop # Any more? + ret # Return + */ + if (!TARGET_VECTOR) + return false; + HOST_WIDE_INT potential_ew + = (MIN (MIN (MEM_ALIGN (src_in), MEM_ALIGN (dst_in)), BITS_PER_WORD) + / BITS_PER_UNIT); + machine_mode vmode = VOIDmode; + bool need_loop = true; + bool size_p = optimize_function_for_size_p (cfun); + rtx src, dst; + rtx end = gen_reg_rtx (Pmode); + rtx vec; + rtx length_rtx = length_in; + + if (CONST_INT_P (length_in)) + { + HOST_WIDE_INT length = INTVAL (length_in); + + /* By using LMUL=8, we can copy as many bytes in one go as there + are bits in a vector register. If the entire block thus fits, + we don't need a loop. */ + if (length <= TARGET_MIN_VLEN) + { + need_loop = false; + + /* If a single scalar load / store pair can do the job, leave it + to the scalar code to do that. */ + /* ??? If fast unaligned access is supported, the scalar code could + use suitably sized scalars irrespective of alignemnt. If that + gets fixed, we have to adjust the test here. */ + + if (pow2p_hwi (length) && length <= potential_ew) + return false; + } + + /* Find the vector mode to use. Using the largest possible element + size is likely to give smaller constants, and thus potentially + reducing code size. However, if we need a loop, we need to update + the pointers, and that is more complicated with a larger element + size, unless we use an immediate, which prevents us from dynamically + using the targets transfer size that the hart supports. And then, + unless we know the *exact* vector size of the hart, we'd need + multiple vsetvli / branch statements, so it's not even a size win. + If, in the future, we find an RISCV-V implementation that is slower + for small element widths, we might allow larger element widths for + loops too. 
*/ + if (need_loop) + potential_ew = 1; + for (; potential_ew; potential_ew >>= 1) + { + scalar_int_mode elem_mode; + unsigned HOST_WIDE_INT bits = potential_ew * BITS_PER_UNIT; + unsigned HOST_WIDE_INT per_iter; + HOST_WIDE_INT nunits; + + if (need_loop) + per_iter = TARGET_MIN_VLEN; + else + per_iter = length; + nunits = per_iter / potential_ew; + + /* Unless we get an implementation that's slow for small element + size / non-word-aligned accesses, we assume that the hardware + handles this well, and we don't want to complicate the code + with shifting word contents around or handling extra bytes at + the start and/or end. So we want the total transfer size and + alignment to fit with the element size. */ + if (length % potential_ew != 0 + || !int_mode_for_size (bits, 0).exists (&elem_mode)) + continue; + /* Find the mode to use for the copy inside the loop - or the + sole copy, if there is no loop. */ + if (!need_loop) + { + /* Try if we have an exact mode for the copy. */ + if (riscv_vector::get_vector_mode (elem_mode, + nunits).exists (&vmode)) + break; + /* Since we don't have a mode that exactlty matches the transfer + size, we'll need to use pred_store, which is not available + for all vector modes, but only iE_RVV_M* modes, hence trying + to find a vector mode for a merely rounded-up size is + pointless. + Still, by choosing a lower LMUL factor that still allows + an entire transfer, we can reduce register pressure. */ + for (unsigned lmul = 1; lmul <= 4; lmul <<= 1) + if (TARGET_MIN_VLEN * lmul <= nunits * BITS_PER_UNIT + /* Avoid loosing the option of using vsetivli . */ + && (nunits <= 31 * lmul || nunits > 31 * 8) + && (riscv_vector::get_vector_mode + (elem_mode, exact_div (BYTES_PER_RISCV_VECTOR * lmul, + potential_ew)).exists (&vmode))) + break; + } + + /* The RVVM8?I modes are notionally 8 * BYTES_PER_RISCV_VECTOR bytes + wide. BYTES_PER_RISCV_VECTOR can't be eavenly divided by + the sizes of larger element types; the LMUL factor of 8 can at + the moment be divided by the SEW, with SEW of up to 8 bytes, + but there are reserved encodings so there might be larger + SEW in the future. */ + if (riscv_vector::get_vector_mode + (elem_mode, exact_div (BYTES_PER_RISCV_VECTOR * 8, + potential_ew)).exists (&vmode)) + break; + + /* We may get here if we tried an element size that's larger than + the hardware supports, but we should at least find a suitable + byte vector mode. */ + gcc_assert (potential_ew > 1); + } + if (potential_ew > 1) + length_rtx = GEN_INT (length / potential_ew); + } + else + { + vmode = E_RVVM8QImode; + } + + /* A memcpy libcall in the worst case takes 3 instructions to prepare the + arguments + 1 for the call. When RVV should take 7 instructions and + we're optimizing for size a libcall may be preferable. */ + if (size_p && need_loop) + return false; + + /* length_rtx holds the (remaining) length of the required copy. + cnt holds the length we copy with the current load/store pair. 
*/ + rtx cnt = length_rtx; + rtx label = NULL_RTX; + rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0)); + rtx src_addr = copy_addr_to_reg (XEXP (src_in, 0)); + + if (need_loop) + { + length_rtx = copy_to_mode_reg (Pmode, length_rtx); + cnt = gen_reg_rtx (Pmode); + label = gen_label_rtx (); + + emit_label (label); + emit_insn (riscv_vector::gen_no_side_effects_vsetvl_rtx (vmode, cnt, + length_rtx)); + } + + vec = gen_reg_rtx (vmode); + src = change_address (src_in, vmode, src_addr); + dst = change_address (dst_in, vmode, dst_addr); + + /* If we don't need a loop and have a suitable mode to describe the size, + just do a load / store pair and leave it up to the later lazy code + motion pass to insert the appropriate vsetvli. */ + if (!need_loop && known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in))) + { + emit_move_insn (vec, src); + emit_move_insn (dst, vec); + } + else + { + machine_mode mask_mode = riscv_vector::get_vector_mode + (BImode, GET_MODE_NUNITS (vmode)).require (); + rtx mask = CONSTM1_RTX (mask_mode); + if (!satisfies_constraint_K (cnt)) + cnt= force_reg (Pmode, cnt); + rtx m_ops[] = {vec, mask, src}; + emit_nonvlmax_insn (code_for_pred_mov (vmode), + riscv_vector::UNARY_OP_TAMA, m_ops, cnt); + emit_insn (gen_pred_store (vmode, dst, mask, vec, cnt, + get_avl_type_rtx (riscv_vector::NONVLMAX))); + } + + if (need_loop) + { + emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, cnt))); + emit_insn (gen_rtx_SET (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, cnt))); + emit_insn (gen_rtx_SET (length_rtx, gen_rtx_MINUS (Pmode, length_rtx, cnt))); + + /* Emit the loop condition. */ + rtx test = gen_rtx_NE (VOIDmode, end, const0_rtx); + emit_jump_insn (gen_cbranch4 (Pmode, test, length_rtx, const0_rtx, label)); + emit_insn (gen_nop ()); + } + + return true; +} + + +/* Implement rawmemchr<mode> using vector instructions. + It can be assumed that the needle is in the haystack, otherwise the + behavior is undefined. */ + +void +expand_rawmemchr (machine_mode mode, rtx dst, rtx src, rtx pat) +{ + /* + rawmemchr: + loop: + vsetvli a1, zero, e[8,16,32,64], m1, ta, ma + vle[8,16,32,64]ff.v v8, (a0) # Load. + csrr a1, vl # Get number of bytes read. + vmseq.vx v0, v8, pat # v0 = (v8 == {pat, pat, ...}) + vfirst.m a2, v0 # Find first hit. + add a0, a0, a1 # Bump pointer. + bltz a2, loop # Not found? + + sub a0, a0, a1 # Go back by a1. + shll a2, a2, [0,1,2,3] # Shift to get byte offset. + add a0, a0, a2 # Add the offset. + + ret + */ + gcc_assert (TARGET_VECTOR); + + unsigned int isize = GET_MODE_SIZE (mode).to_constant (); + int lmul = riscv_autovec_lmul == RVV_DYNAMIC ? RVV_M8 : riscv_autovec_lmul; + poly_int64 nunits = exact_div (BYTES_PER_RISCV_VECTOR * lmul, isize); + + machine_mode vmode; + if (!riscv_vector::get_vector_mode (GET_MODE_INNER (mode), + nunits).exists (&vmode)) + gcc_unreachable (); + + machine_mode mask_mode = riscv_vector::get_mask_mode (vmode); + + rtx cnt = gen_reg_rtx (Pmode); + rtx end = gen_reg_rtx (Pmode); + rtx vec = gen_reg_rtx (vmode); + rtx mask = gen_reg_rtx (mask_mode); + + /* After finding the first vector element matching the needle, we + need to multiply by the vector element width (SEW) in order to + return a pointer to the matching byte. */ + unsigned int shift = exact_log2 (GET_MODE_SIZE (mode).to_constant ()); + + rtx src_addr = copy_addr_to_reg (XEXP (src, 0)); + + rtx loop = gen_label_rtx (); + emit_label (loop); + + rtx vsrc = change_address (src, vmode, src_addr); + + /* Emit a first-fault load. 
*/ + rtx vlops[] = {vec, vsrc}; + emit_vlmax_insn (code_for_pred_fault_load (vmode), + riscv_vector::UNARY_OP, vlops); + + /* Read how far we read. */ + if (Pmode == SImode) + emit_insn (gen_read_vlsi (cnt)); + else + emit_insn (gen_read_vldi_zero_extend (cnt)); + + /* Compare needle with haystack and store in a mask. */ + rtx eq = gen_rtx_EQ (mask_mode, gen_const_vec_duplicate (vmode, pat), vec); + rtx vmsops[] = {mask, eq, vec, pat}; + emit_nonvlmax_insn (code_for_pred_eqne_scalar (vmode), + riscv_vector::COMPARE_OP, vmsops, cnt); + + /* Find the first bit in the mask. */ + rtx vfops[] = {end, mask}; + emit_nonvlmax_insn (code_for_pred_ffs (mask_mode, Pmode), + riscv_vector::CPOP_OP, vfops, cnt); + + /* Bump the pointer. */ + emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, cnt))); + + /* Emit the loop condition. */ + rtx test = gen_rtx_LT (VOIDmode, end, const0_rtx); + emit_jump_insn (gen_cbranch4 (Pmode, test, end, const0_rtx, loop)); + + /* We overran by CNT, subtract it. */ + emit_insn (gen_rtx_SET (src_addr, gen_rtx_MINUS (Pmode, src_addr, cnt))); + + /* We found something at SRC + END * [1,2,4,8]. */ + emit_insn (gen_rtx_SET (end, gen_rtx_ASHIFT (Pmode, end, GEN_INT (shift)))); + emit_insn (gen_rtx_SET (dst, gen_rtx_PLUS (Pmode, src_addr, end))); +} + +} diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index 3fe8125801b..4374afe6765 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -1725,7 +1725,7 @@ force_vector_length_operand (rtx vl) return vl; } -static rtx +rtx gen_no_side_effects_vsetvl_rtx (machine_mode vmode, rtx vl, rtx avl) { unsigned int sew = get_sew (vmode); @@ -2015,206 +2015,6 @@ expand_tuple_move (rtx *ops) } } -/* Used by cpymemsi in riscv.md . */ - -bool -expand_block_move (rtx dst_in, rtx src_in, rtx length_in) -{ - /* - memcpy: - mv a3, a0 # Copy destination - loop: - vsetvli t0, a2, e8, m8, ta, ma # Vectors of 8b - vle8.v v0, (a1) # Load bytes - add a1, a1, t0 # Bump pointer - sub a2, a2, t0 # Decrement count - vse8.v v0, (a3) # Store bytes - add a3, a3, t0 # Bump pointer - bnez a2, loop # Any more? - ret # Return - */ - if (!TARGET_VECTOR) - return false; - HOST_WIDE_INT potential_ew - = (MIN (MIN (MEM_ALIGN (src_in), MEM_ALIGN (dst_in)), BITS_PER_WORD) - / BITS_PER_UNIT); - machine_mode vmode = VOIDmode; - bool need_loop = true; - bool size_p = optimize_function_for_size_p (cfun); - rtx src, dst; - rtx end = gen_reg_rtx (Pmode); - rtx vec; - rtx length_rtx = length_in; - - if (CONST_INT_P (length_in)) - { - HOST_WIDE_INT length = INTVAL (length_in); - - /* By using LMUL=8, we can copy as many bytes in one go as there - are bits in a vector register. If the entire block thus fits, - we don't need a loop. */ - if (length <= TARGET_MIN_VLEN) - { - need_loop = false; - - /* If a single scalar load / store pair can do the job, leave it - to the scalar code to do that. */ - /* ??? If fast unaligned access is supported, the scalar code could - use suitably sized scalars irrespective of alignemnt. If that - gets fixed, we have to adjust the test here. */ - - if (pow2p_hwi (length) && length <= potential_ew) - return false; - } - - /* Find the vector mode to use. Using the largest possible element - size is likely to give smaller constants, and thus potentially - reducing code size. 
However, if we need a loop, we need to update - the pointers, and that is more complicated with a larger element - size, unless we use an immediate, which prevents us from dynamically - using the targets transfer size that the hart supports. And then, - unless we know the *exact* vector size of the hart, we'd need - multiple vsetvli / branch statements, so it's not even a size win. - If, in the future, we find an RISCV-V implementation that is slower - for small element widths, we might allow larger element widths for - loops too. */ - if (need_loop) - potential_ew = 1; - for (; potential_ew; potential_ew >>= 1) - { - scalar_int_mode elem_mode; - unsigned HOST_WIDE_INT bits = potential_ew * BITS_PER_UNIT; - unsigned HOST_WIDE_INT per_iter; - HOST_WIDE_INT nunits; - - if (need_loop) - per_iter = TARGET_MIN_VLEN; - else - per_iter = length; - nunits = per_iter / potential_ew; - - /* Unless we get an implementation that's slow for small element - size / non-word-aligned accesses, we assume that the hardware - handles this well, and we don't want to complicate the code - with shifting word contents around or handling extra bytes at - the start and/or end. So we want the total transfer size and - alignment to fit with the element size. */ - if (length % potential_ew != 0 - || !int_mode_for_size (bits, 0).exists (&elem_mode)) - continue; - /* Find the mode to use for the copy inside the loop - or the - sole copy, if there is no loop. */ - if (!need_loop) - { - /* Try if we have an exact mode for the copy. */ - if (get_vector_mode (elem_mode, nunits).exists (&vmode)) - break; - /* Since we don't have a mode that exactlty matches the transfer - size, we'll need to use pred_store, which is not available - for all vector modes, but only iE_RVV_M* modes, hence trying - to find a vector mode for a merely rounded-up size is - pointless. - Still, by choosing a lower LMUL factor that still allows - an entire transfer, we can reduce register pressure. */ - for (unsigned lmul = 1; lmul <= 4; lmul <<= 1) - if (TARGET_MIN_VLEN * lmul <= nunits * BITS_PER_UNIT - /* Avoid loosing the option of using vsetivli . */ - && (nunits <= 31 * lmul || nunits > 31 * 8) - && (get_vector_mode - (elem_mode, - exact_div (BYTES_PER_RISCV_VECTOR * lmul, - potential_ew) - ).exists (&vmode))) - break; - } - - /* The RVVM8?I modes are notionally 8 * BYTES_PER_RISCV_VECTOR bytes - wide. BYTES_PER_RISCV_VECTOR can't be eavenly divided by - the sizes of larger element types; the LMUL factor of 8 can at - the moment be divided by the SEW, with SEW of up to 8 bytes, - but there are reserved encodings so there might be larger - SEW in the future. */ - if (get_vector_mode (elem_mode, - exact_div (BYTES_PER_RISCV_VECTOR * 8, - potential_ew)).exists (&vmode)) - break; - - /* We may get here if we tried an element size that's larger than - the hardware supports, but we should at least find a suitable - byte vector mode. */ - gcc_assert (potential_ew > 1); - } - if (potential_ew > 1) - length_rtx = GEN_INT (length / potential_ew); - } - else - { - vmode = E_RVVM8QImode; - } - - /* A memcpy libcall in the worst case takes 3 instructions to prepare the - arguments + 1 for the call. When RVV should take 7 instructions and - we're optimizing for size a libcall may be preferable. */ - if (size_p && need_loop) - return false; - - /* length_rtx holds the (remaining) length of the required copy. - cnt holds the length we copy with the current load/store pair. 
*/ - rtx cnt = length_rtx; - rtx label = NULL_RTX; - rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0)); - rtx src_addr = copy_addr_to_reg (XEXP (src_in, 0)); - - if (need_loop) - { - length_rtx = copy_to_mode_reg (Pmode, length_rtx); - cnt = gen_reg_rtx (Pmode); - label = gen_label_rtx (); - - emit_label (label); - emit_insn (gen_no_side_effects_vsetvl_rtx (vmode, cnt, length_rtx)); - } - - vec = gen_reg_rtx (vmode); - src = change_address (src_in, vmode, src_addr); - dst = change_address (dst_in, vmode, dst_addr); - - /* If we don't need a loop and have a suitable mode to describe the size, - just do a load / store pair and leave it up to the later lazy code - motion pass to insert the appropriate vsetvli. */ - if (!need_loop && known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in))) - { - emit_move_insn (vec, src); - emit_move_insn (dst, vec); - } - else - { - machine_mode mask_mode = get_vector_mode (BImode, GET_MODE_NUNITS (vmode)).require (); - rtx mask = CONSTM1_RTX (mask_mode); - if (!satisfies_constraint_K (cnt)) - cnt= force_reg (Pmode, cnt); - rtx m_ops[] = {vec, mask, src}; - emit_nonvlmax_insn (code_for_pred_mov (vmode), UNARY_OP_TAMA, - m_ops, cnt); - emit_insn (gen_pred_store (vmode, dst, mask, vec, cnt, - get_avl_type_rtx (NONVLMAX))); - } - - if (need_loop) - { - emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, cnt))); - emit_insn (gen_rtx_SET (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, cnt))); - emit_insn (gen_rtx_SET (length_rtx, gen_rtx_MINUS (Pmode, length_rtx, cnt))); - - /* Emit the loop condition. */ - rtx test = gen_rtx_NE (VOIDmode, end, const0_rtx); - emit_jump_insn (gen_cbranch4 (Pmode, test, length_rtx, const0_rtx, label)); - emit_insn (gen_nop ()); - } - - return true; -} - /* Return the vectorization machine mode for RVV according to LMUL. */ machine_mode preferred_simd_mode (scalar_mode mode) diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc index ed83fa8112e..adf84f20a44 100644 --- a/gcc/internal-fn.cc +++ b/gcc/internal-fn.cc @@ -3242,7 +3242,7 @@ expand_VEC_CONVERT (internal_fn, gcall *) gcc_unreachable (); } -/* Expand IFN_RAWMEMCHAR internal function. */ +/* Expand IFN_RAWMEMCHR internal function. 
*/ void expand_RAWMEMCHR (internal_fn, gcall *stmt) diff --git a/gcc/testsuite/gcc.dg/tree-prof/peel-2.c b/gcc/testsuite/gcc.dg/tree-prof/peel-2.c index ac417fb3b57..216e6552a58 100644 --- a/gcc/testsuite/gcc.dg/tree-prof/peel-2.c +++ b/gcc/testsuite/gcc.dg/tree-prof/peel-2.c @@ -1,4 +1,4 @@ -/* { dg-options "-O3 -fdump-tree-cunroll-details -fno-unroll-loops -fpeel-loops -fdump-tree-ch2-details-blocks" } */ +/* { dg-options "-O3 -fdump-tree-cunroll-details -fno-unroll-loops -fpeel-loops -fdump-tree-ch2-details-blocks -fno-tree-loop-distribute-patterns" } */ int a[100]; int n = 1000000; int zeroc; diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c index bf6335f6360..adf53b10def 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c @@ -1,9 +1,9 @@ -/* { dg-do run { target s390x-*-* } } */ +/* { dg-do run { target { { s390x-*-* } || { riscv_v } } } } */ /* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */ /* { dg-additional-options "-march=z13 -mzarch" { target s390x-*-* } } */ -/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { target s390x-*-* } } } */ -/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { target s390x-*-* } } } */ -/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { target s390x-*-* } } } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */ /* Rawmemchr pattern: reduction stmt and no store */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c index 83f5a35a322..6c8a485a3aa 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c @@ -1,9 +1,9 @@ -/* { dg-do run { target s390x-*-* } } */ +/* { dg-do run { target { { s390x-*-* } || { riscv_v } } } } */ /* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */ /* { dg-additional-options "-march=z13 -mzarch" { target s390x-*-* } } */ -/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { target s390x-*-* } } } */ -/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { target s390x-*-* } } } */ -/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { target s390x-*-* } } } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */ /* Rawmemchr pattern: reduction stmt and store */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/builtin/rawmemchr-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/builtin/rawmemchr-1.c new file mode 100644 index 00000000000..ba83cb3836f --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/builtin/rawmemchr-1.c @@ -0,0 +1,99 @@ +/* { dg-do run { target { riscv_v } } } */ +/* { dg-additional-options "-std=gnu99 -O2 -ftree-loop-distribution 
-fdump-tree-ldist-details" } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" } } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" } } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" } } */ + +#include <string.h> +#include <assert.h> +#include <stdint.h> +#include <stdlib.h> + +#define rawmemchrT(T, pattern) \ +__attribute__((noinline,noclone)) \ +T* rawmemchr_##T (T *s) \ +{ \ + while (*s != pattern) \ + ++s; \ + return s; \ +} + +rawmemchrT(int8_t, (int8_t)0xde) +rawmemchrT(uint8_t, 0xde) +rawmemchrT(int16_t, (int16_t)0xdead) +rawmemchrT(uint16_t, 0xdead) +rawmemchrT(int32_t, (int32_t)0xdeadbeef) +rawmemchrT(uint32_t, 0xdeadbeef) + +#define runT(T, pattern) \ +void run_##T () \ +{ \ + T *buf = malloc (4096 * 2 * sizeof(T)); \ + assert (buf != NULL); \ + memset (buf, 0xa, 4096 * 2 * sizeof(T)); \ + /* ensure q is 4096-byte aligned */ \ + T *q = (T*)((unsigned char *)buf \ + + (4096 - ((uintptr_t)buf & 4095))); \ + T *p; \ + /* unaligned + block boundary + 1st load */ \ + p = (T *) ((uintptr_t)q - 8); \ + p[2] = pattern; \ + assert ((rawmemchr_##T (&p[0]) == &p[2])); \ + p[2] = (T) 0xaaaaaaaa; \ + /* unaligned + block boundary + 2nd load */ \ + p = (T *) ((uintptr_t)q - 8); \ + p[6] = pattern; \ + assert ((rawmemchr_##T (&p[0]) == &p[6])); \ + p[6] = (T) 0xaaaaaaaa; \ + /* unaligned + 1st load */ \ + q[5] = pattern; \ + assert ((rawmemchr_##T (&q[2]) == &q[5])); \ + q[5] = (T) 0xaaaaaaaa; \ + /* unaligned + 2nd load */ \ + q[14] = pattern; \ + assert ((rawmemchr_##T (&q[2]) == &q[14])); \ + q[14] = (T) 0xaaaaaaaa; \ + /* unaligned + 3rd load */ \ + q[19] = pattern; \ + assert ((rawmemchr_##T (&q[2]) == &q[19])); \ + q[19] = (T) 0xaaaaaaaa; \ + /* unaligned + 4th load */ \ + q[25] = pattern; \ + assert ((rawmemchr_##T (&q[2]) == &q[25])); \ + q[25] = (T) 0xaaaaaaaa; \ + /* aligned + 1st load */ \ + q[5] = pattern; \ + assert ((rawmemchr_##T (&q[0]) == &q[5])); \ + q[5] = (T) 0xaaaaaaaa; \ + /* aligned + 2nd load */ \ + q[14] = pattern; \ + assert ((rawmemchr_##T (&q[0]) == &q[14])); \ + q[14] = (T) 0xaaaaaaaa; \ + /* aligned + 3rd load */ \ + q[19] = pattern; \ + assert ((rawmemchr_##T (&q[0]) == &q[19])); \ + q[19] = (T) 0xaaaaaaaa; \ + /* aligned + 4th load */ \ + q[25] = pattern; \ + assert ((rawmemchr_##T (&q[0]) == &q[25])); \ + q[25] = (T) 0xaaaaaaaa; \ + free (buf); \ +} + +runT(int8_t, (int8_t)0xde) +runT(uint8_t, 0xde) +runT(int16_t, (int16_t)0xdead) +runT(uint16_t, 0xdead) +runT(int32_t, (int32_t)0xdeadbeef) +runT(uint32_t, 0xdeadbeef) + +int main (void) +{ + run_uint8_t (); + run_int8_t (); + run_uint16_t (); + run_int16_t (); + run_uint32_t (); + run_int32_t (); + return 0; +} diff --git a/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp b/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp index b19aa7b4ae6..9f7a10d5b78 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp +++ b/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp @@ -81,6 +81,8 @@ foreach op $AUTOVEC_TEST_OPTS { "" "$op" dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/autovec/cond/*.\[cS\]]] \ "" "$op" + dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/autovec/builtin/*.\[cS\]]] \ + "" "$op" } # widening operation only test on LMUL < 8 -- 2.41.0