Could you put it into riscv-string.cc rather than riscv-v.cc? I would
like to keep those builtin function expanders together if possible.
The name riscv-string.cc might be a little bit confusing, but these
functions are all declared in string.h.

On Fri, Oct 27, 2023 at 3:40 PM juzhe.zh...@rivai.ai
<juzhe.zh...@rivai.ai> wrote:
>
> LGTM. Thanks.
>
> ________________________________
> juzhe.zh...@rivai.ai
>
>
> From: Robin Dapp
> Date: 2023-10-27 15:38
> To: 钟居哲; gcc-patches; palmer; kito.cheng; Jeff Law
> CC: rdapp.gcc
> Subject: Re: [PATCH] RISC-V: Add rawmemchr expander.
> > I suggest adapting the code as follows:
> >
> > unsigned int element_size = GET_MODE_SIZE (mode).to_constant ();
> > poly_int64 nunits = exact_div (BYTES_PER_RISCV_VECTOR *TARGET_MAX_LMUL, 
> > element_size);
> > if (!get_vector_mode(mode, nunits).exists(&vmode))
> >   gcc_unreachable ();
>
> Actually I was initially considering using lmul = m8 here,
> unconditionally, but the param is probably the more intuitive choice.
>
> Attached v2 with that included.  Also moved the riscv test to
> autovec/builtin/ so we can add the other builtins as well.
>
> > Also, this patch reminds me that we are missing some more similar builtin
> > functions which can use RVV:
> >
> > strlen, strcpy, strcmp...etc
>
> Yes we should still have them but I'd rather not work on that right
> now.  How about I open a PR for it so we can still add them in stage 3?
> Their impact is pretty localized and the risk should be low.
> Kito, Palmer, Jeff - would that be acceptable?
>
> Regards
> Robin
>
> gcc/ChangeLog:
>
> * config/riscv/autovec.md (rawmemchr<ANYI:mode>): New expander.
> * config/riscv/riscv-protos.h (enum insn_type): Define.
> (expand_rawmemchr): New function.
> * config/riscv/riscv-v.cc (expand_rawmemchr): Add vectorized
> rawmemchr.
> * internal-fn.cc (expand_RAWMEMCHR): Fix typo.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/tree-ssa/ldist-rawmemchr-1.c: Add riscv.
> * gcc.dg/tree-ssa/ldist-rawmemchr-2.c: Ditto.
> * gcc.target/riscv/rvv/rvv.exp: Add builtin directory.
> * gcc.target/riscv/rvv/autovec/builtin/rawmemchr-1.c: New test.
> ---
> gcc/config/riscv/autovec.md                   | 13 +++
> gcc/config/riscv/riscv-protos.h               |  1 +
> gcc/config/riscv/riscv-v.cc                   | 89 +++++++++++++++++
> gcc/internal-fn.cc                            |  2 +-
> .../gcc.dg/tree-ssa/ldist-rawmemchr-1.c       |  8 +-
> .../gcc.dg/tree-ssa/ldist-rawmemchr-2.c       |  8 +-
> .../riscv/rvv/autovec/builtin/rawmemchr-1.c   | 99 +++++++++++++++++++
> gcc/testsuite/gcc.target/riscv/rvv/rvv.exp    |  2 +
> 8 files changed, 213 insertions(+), 9 deletions(-)
> create mode 100644 
> gcc/testsuite/gcc.target/riscv/rvv/autovec/builtin/rawmemchr-1.c
>
> diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
> index 1ddc1993120..4f13494afdb 100644
> --- a/gcc/config/riscv/autovec.md
> +++ b/gcc/config/riscv/autovec.md
> @@ -2397,3 +2397,16 @@ (define_expand "lfloor<mode><v_i_l_ll_convert>2"
>      DONE;
>    }
> )
> +
> +;; Implement rawmemchr[qi|si|hi].
> +(define_expand "rawmemchr<ANYI:mode>"
> +  [(match_operand      0 "register_operand")
> +   (match_operand      1 "memory_operand")
> +   (match_operand:ANYI 2 "const_int_operand")]
> +  "TARGET_VECTOR"
> +  {
> +    riscv_vector::expand_rawmemchr(<MODE>mode, operands[0], operands[1],
> +    operands[2]);
> +    DONE;
> +  }
> +)
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index 843a81b0e86..7f148ed95fe 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -526,6 +526,7 @@ void expand_cond_unop (unsigned, rtx *);
> void expand_cond_binop (unsigned, rtx *);
> void expand_cond_ternop (unsigned, rtx *);
> void expand_popcount (rtx *);
> +void expand_rawmemchr (machine_mode, rtx, rtx, rtx);
> /* Rounding mode bitfield for fixed point VXRM.  */
> enum fixed_point_rounding_mode
> diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
> index 3fe8125801b..0f664553cf4 100644
> --- a/gcc/config/riscv/riscv-v.cc
> +++ b/gcc/config/riscv/riscv-v.cc
> @@ -2215,6 +2215,95 @@ expand_block_move (rtx dst_in, rtx src_in, rtx 
> length_in)
>    return true;
> }
> +/* Implement rawmemchr<mode> using vector instructions.
> +   It can be assumed that the needle is in the haystack, otherwise the
> +   behavior is undefined.  */
> +
> +void
> +expand_rawmemchr (machine_mode mode, rtx dst, rtx src, rtx pat)
> +{
> +  /*
> +    rawmemchr:
> +    loop:
> +       vsetvli a1, zero, e[8,16,32,64], m1, ta, ma
> +       vle[8,16,32,64]ff.v v8, (a0)  # Load.
> +       csrr a1, vl      # Get number of bytes read.
> +       vmseq.vx v0, v8, pat      # v0 = (v8 == {pat, pat, ...})
> +       vfirst.m a2, v0      # Find first hit.
> +       add a0, a0, a1      # Bump pointer.
> +       bltz a2, loop      # Not found?
> +
> +       sub a0, a0, a1      # Go back by a1.
> +       shll a2, a2, [0,1,2,3]      # Shift to get byte offset.
> +       add a0, a0, a2      # Add the offset.
> +
> +       ret
> +  */
> +  gcc_assert (TARGET_VECTOR);
> +
> +  unsigned int isize = GET_MODE_SIZE (mode).to_constant ();
> +  int lmul = riscv_autovec_lmul == RVV_DYNAMIC ? RVV_M8 : riscv_autovec_lmul;
> +  poly_int64 nunits = exact_div (BYTES_PER_RISCV_VECTOR * lmul, isize);
> +
> +  machine_mode vmode;
> +  if (!get_vector_mode (GET_MODE_INNER (mode), nunits).exists (&vmode))
> +    gcc_unreachable ();
> +
> +  machine_mode mask_mode = get_mask_mode (vmode);
> +
> +  rtx cnt = gen_reg_rtx (Pmode);
> +  rtx end = gen_reg_rtx (Pmode);
> +  rtx vec = gen_reg_rtx (vmode);
> +  rtx mask = gen_reg_rtx (mask_mode);
> +
> +  /* After finding the first vector element matching the needle, we
> +     need to multiply by the vector element width (SEW) in order to
> +     return a pointer to the matching byte.  */
> +  unsigned int shift = exact_log2 (GET_MODE_SIZE (mode).to_constant ());
> +
> +  rtx src_addr = copy_addr_to_reg (XEXP (src, 0));
> +
> +  rtx loop = gen_label_rtx ();
> +  emit_label (loop);
> +
> +  rtx vsrc = change_address (src, vmode, src_addr);
> +
> +  /* Emit a first-fault load.  */
> +  rtx vlops[] = {vec, vsrc};
> +  emit_vlmax_insn (code_for_pred_fault_load (vmode), UNARY_OP, vlops);
> +
> +  /* Read how far we read.  */
> +  if (Pmode == SImode)
> +    emit_insn (gen_read_vlsi (cnt));
> +  else
> +    emit_insn (gen_read_vldi_zero_extend (cnt));
> +
> +  /* Compare needle with haystack and store in a mask.  */
> +  rtx eq = gen_rtx_EQ (mask_mode, gen_const_vec_duplicate (vmode, pat), vec);
> +  rtx vmsops[] = {mask, eq, vec, pat};
> +  emit_nonvlmax_insn (code_for_pred_eqne_scalar (vmode), COMPARE_OP, vmsops,
> +       cnt);
> +
> +  /* Find the first bit in the mask.  */
> +  rtx vfops[] = {end, mask};
> +  emit_nonvlmax_insn (code_for_pred_ffs (mask_mode, Pmode),
> +       CPOP_OP, vfops, cnt);
> +
> +  /* Bump the pointer.  */
> +  emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, cnt)));
> +
> +  /* Emit the loop condition.  */
> +  rtx test = gen_rtx_LT (VOIDmode, end, const0_rtx);
> +  emit_jump_insn (gen_cbranch4 (Pmode, test, end, const0_rtx, loop));
> +
> +  /*  We overran by CNT, subtract it.  */
> +  emit_insn (gen_rtx_SET (src_addr, gen_rtx_MINUS (Pmode, src_addr, cnt)));
> +
> +  /*  We found something at SRC + END * [1,2,4,8].  */
> +  emit_insn (gen_rtx_SET (end, gen_rtx_ASHIFT (Pmode, end, GEN_INT 
> (shift))));
> +  emit_insn (gen_rtx_SET (dst, gen_rtx_PLUS (Pmode, src_addr, end)));
> +}
> +
> /* Return the vectorization machine mode for RVV according to LMUL.  */
> machine_mode
> preferred_simd_mode (scalar_mode mode)
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index ed83fa8112e..adf84f20a44 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -3242,7 +3242,7 @@ expand_VEC_CONVERT (internal_fn, gcall *)
>    gcc_unreachable ();
> }
> -/* Expand IFN_RAWMEMCHAR internal function.  */
> +/* Expand IFN_RAWMEMCHR internal function.  */
> void
> expand_RAWMEMCHR (internal_fn, gcall *stmt)
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c
> index bf6335f6360..adf53b10def 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c
> @@ -1,9 +1,9 @@
> -/* { dg-do run { target s390x-*-* } } */
> +/* { dg-do run { target { { s390x-*-* } || { riscv_v } } } } */
> /* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */
> /* { dg-additional-options "-march=z13 -mzarch" { target s390x-*-* } } */
> -/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { 
> target s390x-*-* } } } */
> -/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { 
> target s390x-*-* } } } */
> -/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { 
> target s390x-*-* } } } */
> +/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { 
> target { { s390x-*-* } || { riscv_v } } } } } */
> +/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { 
> target { { s390x-*-* } || { riscv_v } } } } } */
> +/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { 
> target { { s390x-*-* } || { riscv_v } } } } } */
> /* Rawmemchr pattern: reduction stmt and no store */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c
> index 83f5a35a322..6c8a485a3aa 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c
> @@ -1,9 +1,9 @@
> -/* { dg-do run { target s390x-*-* } } */
> +/* { dg-do run { target { { s390x-*-* } || { riscv_v } } } } */
> /* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */
> /* { dg-additional-options "-march=z13 -mzarch" { target s390x-*-* } } */
> -/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { 
> target s390x-*-* } } } */
> -/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { 
> target s390x-*-* } } } */
> -/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { 
> target s390x-*-* } } } */
> +/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { 
> target { { s390x-*-* } || { riscv_v } } } } } */
> +/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { 
> target { { s390x-*-* } || { riscv_v } } } } } */
> +/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { 
> target { { s390x-*-* } || { riscv_v } } } } } */
> /* Rawmemchr pattern: reduction stmt and store */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/builtin/rawmemchr-1.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/builtin/rawmemchr-1.c
> new file mode 100644
> index 00000000000..ba83cb3836f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/builtin/rawmemchr-1.c
> @@ -0,0 +1,99 @@
> +/* { dg-do run { target { riscv_v } } } */
> +/* { dg-additional-options "-std=gnu99 -O2 -ftree-loop-distribution 
> -fdump-tree-ldist-details" } */
> +/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" } } */
> +/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" } } */
> +/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" } } */
> +
> +#include <string.h>
> +#include <assert.h>
> +#include <stdint.h>
> +#include <stdlib.h>
> +
> +#define rawmemchrT(T, pattern)     \
> +__attribute__((noinline,noclone))  \
> +T* rawmemchr_##T (T *s)            \
> +{                                  \
> +  while (*s != pattern)            \
> +    ++s;                           \
> +  return s;                        \
> +}
> +
> +rawmemchrT(int8_t, (int8_t)0xde)
> +rawmemchrT(uint8_t, 0xde)
> +rawmemchrT(int16_t, (int16_t)0xdead)
> +rawmemchrT(uint16_t, 0xdead)
> +rawmemchrT(int32_t, (int32_t)0xdeadbeef)
> +rawmemchrT(uint32_t, 0xdeadbeef)
> +
> +#define runT(T, pattern)                           \
> +void run_##T ()                                    \
> +{                                                  \
> +  T *buf = malloc (4096 * 2 * sizeof(T));          \
> +  assert (buf != NULL);                            \
> +  memset (buf, 0xa, 4096 * 2 * sizeof(T));         \
> +  /* ensure q is 4096-byte aligned */              \
> +  T *q = (T*)((unsigned char *)buf                 \
> +              + (4096 - ((uintptr_t)buf & 4095))); \
> +  T *p;                                            \
> +  /* unaligned + block boundary + 1st load */      \
> +  p = (T *) ((uintptr_t)q - 8);                    \
> +  p[2] = pattern;                                  \
> +  assert ((rawmemchr_##T (&p[0]) == &p[2]));       \
> +  p[2] = (T) 0xaaaaaaaa;                           \
> +  /* unaligned + block boundary + 2nd load */      \
> +  p = (T *) ((uintptr_t)q - 8);                    \
> +  p[6] = pattern;                                  \
> +  assert ((rawmemchr_##T (&p[0]) == &p[6]));       \
> +  p[6] = (T) 0xaaaaaaaa;                           \
> +  /* unaligned + 1st load */                       \
> +  q[5] = pattern;                                  \
> +  assert ((rawmemchr_##T (&q[2]) == &q[5]));       \
> +  q[5] = (T) 0xaaaaaaaa;                           \
> +  /* unaligned + 2nd load */                       \
> +  q[14] = pattern;                                 \
> +  assert ((rawmemchr_##T (&q[2]) == &q[14]));      \
> +  q[14] = (T) 0xaaaaaaaa;                          \
> +  /* unaligned + 3rd load */                       \
> +  q[19] = pattern;                                 \
> +  assert ((rawmemchr_##T (&q[2]) == &q[19]));      \
> +  q[19] = (T) 0xaaaaaaaa;                          \
> +  /* unaligned + 4th load */                       \
> +  q[25] = pattern;                                 \
> +  assert ((rawmemchr_##T (&q[2]) == &q[25]));      \
> +  q[25] = (T) 0xaaaaaaaa;                          \
> +  /* aligned + 1st load */                         \
> +  q[5] = pattern;                                  \
> +  assert ((rawmemchr_##T (&q[0]) == &q[5]));       \
> +  q[5] = (T) 0xaaaaaaaa;                           \
> +  /* aligned + 2nd load */                         \
> +  q[14] = pattern;                                 \
> +  assert ((rawmemchr_##T (&q[0]) == &q[14]));      \
> +  q[14] = (T) 0xaaaaaaaa;                          \
> +  /* aligned + 3rd load */                         \
> +  q[19] = pattern;                                 \
> +  assert ((rawmemchr_##T (&q[0]) == &q[19]));      \
> +  q[19] = (T) 0xaaaaaaaa;                          \
> +  /* aligned + 4th load */                         \
> +  q[25] = pattern;                                 \
> +  assert ((rawmemchr_##T (&q[0]) == &q[25]));      \
> +  q[25] = (T) 0xaaaaaaaa;                          \
> +  free (buf);                                      \
> +}
> +
> +runT(int8_t, (int8_t)0xde)
> +runT(uint8_t, 0xde)
> +runT(int16_t, (int16_t)0xdead)
> +runT(uint16_t, 0xdead)
> +runT(int32_t, (int32_t)0xdeadbeef)
> +runT(uint32_t, 0xdeadbeef)
> +
> +int main (void)
> +{
> +  run_uint8_t ();
> +  run_int8_t ();
> +  run_uint16_t ();
> +  run_int16_t ();
> +  run_uint32_t ();
> +  run_int32_t ();
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp 
> b/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp
> index b19aa7b4ae6..9f7a10d5b78 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp
> @@ -81,6 +81,8 @@ foreach op $AUTOVEC_TEST_OPTS {
>      "" "$op"
>    dg-runtest [lsort [glob -nocomplain 
> $srcdir/$subdir/autovec/cond/*.\[cS\]]] \
>      "" "$op"
> +  dg-runtest [lsort [glob -nocomplain 
> $srcdir/$subdir/autovec/builtin/*.\[cS\]]] \
> +    "" "$op"
> }
> # widening operation only test on LMUL < 8
> --
> 2.41.0
>
>

Reply via email to