Could you put it into riscv-string.cc rather than riscv-v.cc? I would like to keep the builtin function expanders together if possible. The name riscv-string.cc might be a little bit confusing, but these functions are all part of string.h.
On Fri, Oct 27, 2023 at 3:40 PM juzhe.zh...@rivai.ai <juzhe.zh...@rivai.ai> wrote: > > LGTM. Thanks. > > ________________________________ > juzhe.zh...@rivai.ai > > > From: Robin Dapp > Date: 2023-10-27 15:38 > To: 钟居哲; gcc-patches; palmer; kito.cheng; Jeff Law > CC: rdapp.gcc > Subject: Re: [PATCH] RISC-V: Add rawmemchr expander. > > Suggested adapt codes as follows: > > > > unsigned int element_size = GET_MODE_SIZE (mode).to_constant (); > > poly_int64 nunits = exact_div (BYTES_PER_RISCV_VECTOR *TARGET_MAX_LMUL, > > element_size); > > if (!get_vector_mode(mode, nunits).exists(&vmode)) > > gcc_unreachable (); > > Actually I was initially considering using lmul = m8 here, > unconditionally, but the param is probably the more intuitive choice. > > Attached v2 with that included. Also moved the riscv test to > autovec/builtin/ so we can add the other builtins as well. > > > Also, this patch reminds me we are missing some more similiar builtin > > function which can use RVV: > > > > strlen, strcpy, strcmp...etc > > Yes we should still have them but I'd rather not work on that right > now. How about I open a PR for it so we can still add them in stage 3? > Their impact is pretty localized and the risk should be low. > Kito, Palmer, Jeff - would that be acceptable? > > Regards > Robin > > gcc/ChangeLog: > > * config/riscv/autovec.md (rawmemchr<ANYI:mode>): New expander. > * config/riscv/riscv-protos.h (enum insn_type): Define. > (expand_rawmemchr): New function. > * config/riscv/riscv-v.cc (expand_rawmemchr): Add vectorized > rawmemchr. > * internal-fn.cc (expand_RAWMEMCHR): Fix typo. > > gcc/testsuite/ChangeLog: > > * gcc.dg/tree-ssa/ldist-rawmemchr-1.c: Add riscv. > * gcc.dg/tree-ssa/ldist-rawmemchr-2.c: Ditto. > * gcc.target/riscv/rvv/rvv.exp: Add builtin directory. > * gcc.target/riscv/rvv/autovec/builtin/rawmemchr-1.c: New test. 
> --- > gcc/config/riscv/autovec.md | 13 +++ > gcc/config/riscv/riscv-protos.h | 1 + > gcc/config/riscv/riscv-v.cc | 89 +++++++++++++++++ > gcc/internal-fn.cc | 2 +- > .../gcc.dg/tree-ssa/ldist-rawmemchr-1.c | 8 +- > .../gcc.dg/tree-ssa/ldist-rawmemchr-2.c | 8 +- > .../riscv/rvv/autovec/builtin/rawmemchr-1.c | 99 +++++++++++++++++++ > gcc/testsuite/gcc.target/riscv/rvv/rvv.exp | 2 + > 8 files changed, 213 insertions(+), 9 deletions(-) > create mode 100644 > gcc/testsuite/gcc.target/riscv/rvv/autovec/builtin/rawmemchr-1.c > > diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md > index 1ddc1993120..4f13494afdb 100644 > --- a/gcc/config/riscv/autovec.md > +++ b/gcc/config/riscv/autovec.md > @@ -2397,3 +2397,16 @@ (define_expand "lfloor<mode><v_i_l_ll_convert>2" > DONE; > } > ) > + > +;; Implement rawmemchr[qi|si|hi]. > +(define_expand "rawmemchr<ANYI:mode>" > + [(match_operand 0 "register_operand") > + (match_operand 1 "memory_operand") > + (match_operand:ANYI 2 "const_int_operand")] > + "TARGET_VECTOR" > + { > + riscv_vector::expand_rawmemchr(<MODE>mode, operands[0], operands[1], > + operands[2]); > + DONE; > + } > +) > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h > index 843a81b0e86..7f148ed95fe 100644 > --- a/gcc/config/riscv/riscv-protos.h > +++ b/gcc/config/riscv/riscv-protos.h > @@ -526,6 +526,7 @@ void expand_cond_unop (unsigned, rtx *); > void expand_cond_binop (unsigned, rtx *); > void expand_cond_ternop (unsigned, rtx *); > void expand_popcount (rtx *); > +void expand_rawmemchr (machine_mode, rtx, rtx, rtx); > /* Rounding mode bitfield for fixed point VXRM. 
*/ > enum fixed_point_rounding_mode > diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc > index 3fe8125801b..0f664553cf4 100644 > --- a/gcc/config/riscv/riscv-v.cc > +++ b/gcc/config/riscv/riscv-v.cc > @@ -2215,6 +2215,95 @@ expand_block_move (rtx dst_in, rtx src_in, rtx > length_in) > return true; > } > +/* Implement rawmemchr<mode> using vector instructions. > + It can be assumed that the needle is in the haystack, otherwise the > + behavior is undefined. */ > + > +void > +expand_rawmemchr (machine_mode mode, rtx dst, rtx src, rtx pat) > +{ > + /* > + rawmemchr: > + loop: > + vsetvli a1, zero, e[8,16,32,64], m1, ta, ma > + vle[8,16,32,64]ff.v v8, (a0) # Load. > + csrr a1, vl # Get number of bytes read. > + vmseq.vx v0, v8, pat # v0 = (v8 == {pat, pat, ...}) > + vfirst.m a2, v0 # Find first hit. > + add a0, a0, a1 # Bump pointer. > + bltz a2, loop # Not found? > + > + sub a0, a0, a1 # Go back by a1. > + shll a2, a2, [0,1,2,3] # Shift to get byte offset. > + add a0, a0, a2 # Add the offset. > + > + ret > + */ > + gcc_assert (TARGET_VECTOR); > + > + unsigned int isize = GET_MODE_SIZE (mode).to_constant (); > + int lmul = riscv_autovec_lmul == RVV_DYNAMIC ? RVV_M8 : riscv_autovec_lmul; > + poly_int64 nunits = exact_div (BYTES_PER_RISCV_VECTOR * lmul, isize); > + > + machine_mode vmode; > + if (!get_vector_mode (GET_MODE_INNER (mode), nunits).exists (&vmode)) > + gcc_unreachable (); > + > + machine_mode mask_mode = get_mask_mode (vmode); > + > + rtx cnt = gen_reg_rtx (Pmode); > + rtx end = gen_reg_rtx (Pmode); > + rtx vec = gen_reg_rtx (vmode); > + rtx mask = gen_reg_rtx (mask_mode); > + > + /* After finding the first vector element matching the needle, we > + need to multiply by the vector element width (SEW) in order to > + return a pointer to the matching byte. 
*/ > + unsigned int shift = exact_log2 (GET_MODE_SIZE (mode).to_constant ()); > + > + rtx src_addr = copy_addr_to_reg (XEXP (src, 0)); > + > + rtx loop = gen_label_rtx (); > + emit_label (loop); > + > + rtx vsrc = change_address (src, vmode, src_addr); > + > + /* Emit a first-fault load. */ > + rtx vlops[] = {vec, vsrc}; > + emit_vlmax_insn (code_for_pred_fault_load (vmode), UNARY_OP, vlops); > + > + /* Read how far we read. */ > + if (Pmode == SImode) > + emit_insn (gen_read_vlsi (cnt)); > + else > + emit_insn (gen_read_vldi_zero_extend (cnt)); > + > + /* Compare needle with haystack and store in a mask. */ > + rtx eq = gen_rtx_EQ (mask_mode, gen_const_vec_duplicate (vmode, pat), vec); > + rtx vmsops[] = {mask, eq, vec, pat}; > + emit_nonvlmax_insn (code_for_pred_eqne_scalar (vmode), COMPARE_OP, vmsops, > + cnt); > + > + /* Find the first bit in the mask. */ > + rtx vfops[] = {end, mask}; > + emit_nonvlmax_insn (code_for_pred_ffs (mask_mode, Pmode), > + CPOP_OP, vfops, cnt); > + > + /* Bump the pointer. */ > + emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, cnt))); > + > + /* Emit the loop condition. */ > + rtx test = gen_rtx_LT (VOIDmode, end, const0_rtx); > + emit_jump_insn (gen_cbranch4 (Pmode, test, end, const0_rtx, loop)); > + > + /* We overran by CNT, subtract it. */ > + emit_insn (gen_rtx_SET (src_addr, gen_rtx_MINUS (Pmode, src_addr, cnt))); > + > + /* We found something at SRC + END * [1,2,4,8]. */ > + emit_insn (gen_rtx_SET (end, gen_rtx_ASHIFT (Pmode, end, GEN_INT > (shift)))); > + emit_insn (gen_rtx_SET (dst, gen_rtx_PLUS (Pmode, src_addr, end))); > +} > + > /* Return the vectorization machine mode for RVV according to LMUL. 
*/ > machine_mode > preferred_simd_mode (scalar_mode mode) > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc > index ed83fa8112e..adf84f20a44 100644 > --- a/gcc/internal-fn.cc > +++ b/gcc/internal-fn.cc > @@ -3242,7 +3242,7 @@ expand_VEC_CONVERT (internal_fn, gcall *) > gcc_unreachable (); > } > -/* Expand IFN_RAWMEMCHAR internal function. */ > +/* Expand IFN_RAWMEMCHR internal function. */ > void > expand_RAWMEMCHR (internal_fn, gcall *stmt) > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c > b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c > index bf6335f6360..adf53b10def 100644 > --- a/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c > +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c > @@ -1,9 +1,9 @@ > -/* { dg-do run { target s390x-*-* } } */ > +/* { dg-do run { target { { s390x-*-* } || { riscv_v } } } } */ > /* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */ > /* { dg-additional-options "-march=z13 -mzarch" { target s390x-*-* } } */ > -/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { > target s390x-*-* } } } */ > -/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { > target s390x-*-* } } } */ > -/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { > target s390x-*-* } } } */ > +/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { > target { { s390x-*-* } || { riscv_v } } } } } */ > +/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { > target { { s390x-*-* } || { riscv_v } } } } } */ > +/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { > target { { s390x-*-* } || { riscv_v } } } } } */ > /* Rawmemchr pattern: reduction stmt and no store */ > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c > b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c > index 83f5a35a322..6c8a485a3aa 100644 > --- a/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c > +++ 
b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c > @@ -1,9 +1,9 @@ > -/* { dg-do run { target s390x-*-* } } */ > +/* { dg-do run { target { { s390x-*-* } || { riscv_v } } } } */ > /* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */ > /* { dg-additional-options "-march=z13 -mzarch" { target s390x-*-* } } */ > -/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { > target s390x-*-* } } } */ > -/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { > target s390x-*-* } } } */ > -/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { > target s390x-*-* } } } */ > +/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { > target { { s390x-*-* } || { riscv_v } } } } } */ > +/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { > target { { s390x-*-* } || { riscv_v } } } } } */ > +/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { > target { { s390x-*-* } || { riscv_v } } } } } */ > /* Rawmemchr pattern: reduction stmt and store */ > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/builtin/rawmemchr-1.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/builtin/rawmemchr-1.c > new file mode 100644 > index 00000000000..ba83cb3836f > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/builtin/rawmemchr-1.c > @@ -0,0 +1,99 @@ > +/* { dg-do run { target { riscv_v } } } */ > +/* { dg-additional-options "-std=gnu99 -O2 -ftree-loop-distribution > -fdump-tree-ldist-details" } */ > +/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" } } */ > +/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" } } */ > +/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" } } */ > + > +#include <string.h> > +#include <assert.h> > +#include <stdint.h> > +#include <stdlib.h> > + > +#define rawmemchrT(T, pattern) \ > +__attribute__((noinline,noclone)) \ > +T* rawmemchr_##T (T *s) \ > +{ \ 
> + while (*s != pattern) \ > + ++s; \ > + return s; \ > +} > + > +rawmemchrT(int8_t, (int8_t)0xde) > +rawmemchrT(uint8_t, 0xde) > +rawmemchrT(int16_t, (int16_t)0xdead) > +rawmemchrT(uint16_t, 0xdead) > +rawmemchrT(int32_t, (int32_t)0xdeadbeef) > +rawmemchrT(uint32_t, 0xdeadbeef) > + > +#define runT(T, pattern) \ > +void run_##T () \ > +{ \ > + T *buf = malloc (4096 * 2 * sizeof(T)); \ > + assert (buf != NULL); \ > + memset (buf, 0xa, 4096 * 2 * sizeof(T)); \ > + /* ensure q is 4096-byte aligned */ \ > + T *q = (T*)((unsigned char *)buf \ > + + (4096 - ((uintptr_t)buf & 4095))); \ > + T *p; \ > + /* unaligned + block boundary + 1st load */ \ > + p = (T *) ((uintptr_t)q - 8); \ > + p[2] = pattern; \ > + assert ((rawmemchr_##T (&p[0]) == &p[2])); \ > + p[2] = (T) 0xaaaaaaaa; \ > + /* unaligned + block boundary + 2nd load */ \ > + p = (T *) ((uintptr_t)q - 8); \ > + p[6] = pattern; \ > + assert ((rawmemchr_##T (&p[0]) == &p[6])); \ > + p[6] = (T) 0xaaaaaaaa; \ > + /* unaligned + 1st load */ \ > + q[5] = pattern; \ > + assert ((rawmemchr_##T (&q[2]) == &q[5])); \ > + q[5] = (T) 0xaaaaaaaa; \ > + /* unaligned + 2nd load */ \ > + q[14] = pattern; \ > + assert ((rawmemchr_##T (&q[2]) == &q[14])); \ > + q[14] = (T) 0xaaaaaaaa; \ > + /* unaligned + 3rd load */ \ > + q[19] = pattern; \ > + assert ((rawmemchr_##T (&q[2]) == &q[19])); \ > + q[19] = (T) 0xaaaaaaaa; \ > + /* unaligned + 4th load */ \ > + q[25] = pattern; \ > + assert ((rawmemchr_##T (&q[2]) == &q[25])); \ > + q[25] = (T) 0xaaaaaaaa; \ > + /* aligned + 1st load */ \ > + q[5] = pattern; \ > + assert ((rawmemchr_##T (&q[0]) == &q[5])); \ > + q[5] = (T) 0xaaaaaaaa; \ > + /* aligned + 2nd load */ \ > + q[14] = pattern; \ > + assert ((rawmemchr_##T (&q[0]) == &q[14])); \ > + q[14] = (T) 0xaaaaaaaa; \ > + /* aligned + 3rd load */ \ > + q[19] = pattern; \ > + assert ((rawmemchr_##T (&q[0]) == &q[19])); \ > + q[19] = (T) 0xaaaaaaaa; \ > + /* aligned + 4th load */ \ > + q[25] = pattern; \ > + assert ((rawmemchr_##T 
(&q[0]) == &q[25])); \ > + q[25] = (T) 0xaaaaaaaa; \ > + free (buf); \ > +} > + > +runT(int8_t, (int8_t)0xde) > +runT(uint8_t, 0xde) > +runT(int16_t, (int16_t)0xdead) > +runT(uint16_t, 0xdead) > +runT(int32_t, (int32_t)0xdeadbeef) > +runT(uint32_t, 0xdeadbeef) > + > +int main (void) > +{ > + run_uint8_t (); > + run_int8_t (); > + run_uint16_t (); > + run_int16_t (); > + run_uint32_t (); > + run_int32_t (); > + return 0; > +} > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp > b/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp > index b19aa7b4ae6..9f7a10d5b78 100644 > --- a/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp > +++ b/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp > @@ -81,6 +81,8 @@ foreach op $AUTOVEC_TEST_OPTS { > "" "$op" > dg-runtest [lsort [glob -nocomplain > $srcdir/$subdir/autovec/cond/*.\[cS\]]] \ > "" "$op" > + dg-runtest [lsort [glob -nocomplain > $srcdir/$subdir/autovec/builtin/*.\[cS\]]] \ > + "" "$op" > } > # widening operation only test on LMUL < 8 > -- > 2.41.0 > >