It seems that you haven't committed it yet. A nit comment:

+  int lmul = riscv_autovec_lmul == RVV_DYNAMIC ? RVV_M8 : riscv_autovec_lmul;

I think you could use TARGET_MAX_LMUL here:
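A minimal sketch of the suggested simplification, assuming TARGET_MAX_LMUL folds the RVV_DYNAMIC case to RVV_M8 as in the riscv-opts.h change linked below:

  /* TARGET_MAX_LMUL already handles riscv_autovec_lmul == RVV_DYNAMIC,
     so expand_rawmemchr could simply do:  */
  int lmul = TARGET_MAX_LMUL;
  poly_int64 nunits = exact_div (BYTES_PER_RISCV_VECTOR * lmul, isize);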
https://gcc.gnu.org/git/?p=gcc.git;a=blobdiff;f=gcc/config/riscv/riscv-opts.h;h=532b1b6b84a0fff7d4755507cef32193276c0c3f;hp=e557f70f414b21fb1ac7535504653f896191238b;hb=446efa52a8cadb56d1d994da5c4de394efaff462;hpb=e37bc2cf00671e3bc4d82f2627330c0f885a6f29 juzhe.zh...@rivai.ai From: Robin Dapp Date: 2023-10-27 16:50 To: juzhe.zh...@rivai.ai; kito.cheng CC: rdapp.gcc; gcc-patches; palmer; jeffreyalaw Subject: Re: [PATCH] RISC-V: Add rawmemchr expander. Attached v3 that I'd commit. Regards Robin From 246b986a8ea2332ced7a094dd68d35d84dcbbc04 Mon Sep 17 00:00:00 2001 From: Robin Dapp <rd...@ventanamicro.com> Date: Tue, 24 Oct 2023 10:33:15 +0200 Subject: [PATCH v3] RISC-V: Add rawmemchr expander. This patch adds a vectorized rawmemchr expander. It also moves the vectorized expand_block_move to riscv-string.cc. gcc/ChangeLog: * config/riscv/autovec.md (rawmemchr<ANYI:mode>): New expander. * config/riscv/riscv-protos.h (gen_no_side_effects_vsetvl_rtx): Define. (expand_rawmemchr): Define. * config/riscv/riscv-v.cc (force_vector_length_operand): Remove static. (expand_block_move): Move from here... * config/riscv/riscv-string.cc (expand_block_move): ...to here. (expand_rawmemchr): Add vectorized expander. * internal-fn.cc (expand_RAWMEMCHR): Fix typo. gcc/testsuite/ChangeLog: * gcc.dg/tree-prof/peel-2.c: Add -fno-tree-loop-distribute-patterns. * gcc.dg/tree-ssa/ldist-rawmemchr-1.c: Add riscv. * gcc.dg/tree-ssa/ldist-rawmemchr-2.c: Ditto. * gcc.target/riscv/rvv/rvv.exp: Add builtin directory. * gcc.target/riscv/rvv/autovec/builtin/rawmemchr-1.c: New test. --- gcc/config/riscv/autovec.md | 13 + gcc/config/riscv/riscv-protos.h | 2 + gcc/config/riscv/riscv-string.cc | 302 ++++++++++++++++++ gcc/config/riscv/riscv-v.cc | 202 +----------- gcc/internal-fn.cc | 2 +- gcc/testsuite/gcc.dg/tree-prof/peel-2.c | 2 +- .../gcc.dg/tree-ssa/ldist-rawmemchr-1.c | 8 +- .../gcc.dg/tree-ssa/ldist-rawmemchr-2.c | 8 +- .../riscv/rvv/autovec/builtin/rawmemchr-1.c | 99 ++++++ gcc/testsuite/gcc.target/riscv/rvv/rvv.exp | 2 + 10 files changed, 429 insertions(+), 211 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/builtin/rawmemchr-1.c diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md index 1ddc1993120..4f13494afdb 100644 --- a/gcc/config/riscv/autovec.md +++ b/gcc/config/riscv/autovec.md @@ -2397,3 +2397,16 @@ (define_expand "lfloor<mode><v_i_l_ll_convert>2" DONE; } ) + +;; Implement rawmemchr[qi|si|hi]. +(define_expand "rawmemchr<ANYI:mode>" + [(match_operand 0 "register_operand") + (match_operand 1 "memory_operand") + (match_operand:ANYI 2 "const_int_operand")] + "TARGET_VECTOR" + { + riscv_vector::expand_rawmemchr(<MODE>mode, operands[0], operands[1], + operands[2]); + DONE; + } +) diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 843a81b0e86..44189ec8139 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -495,6 +495,7 @@ void expand_vec_lfloor (rtx, rtx, machine_mode, machine_mode); bool sew64_scalar_helper (rtx *, rtx *, rtx, machine_mode, bool, void (*)(rtx *, rtx)); rtx gen_scalar_move_mask (machine_mode); +rtx gen_no_side_effects_vsetvl_rtx (machine_mode, rtx, rtx); /* RVV vector register sizes. 
TODO: Currently, we only add RVV_32/RVV_64/RVV_128, we may need to @@ -526,6 +527,7 @@ void expand_cond_unop (unsigned, rtx *); void expand_cond_binop (unsigned, rtx *); void expand_cond_ternop (unsigned, rtx *); void expand_popcount (rtx *); +void expand_rawmemchr (machine_mode, rtx, rtx, rtx); /* Rounding mode bitfield for fixed point VXRM. */ enum fixed_point_rounding_mode diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc index 0b4606aa7b2..75c2acaf9b1 100644 --- a/gcc/config/riscv/riscv-string.cc +++ b/gcc/config/riscv/riscv-string.cc @@ -36,6 +36,9 @@ #include "target.h" #include "predict.h" #include "optabs.h" +#include "riscv-protos.h" +#include "recog.h" +#include "tm-constrs.h" /* Emit proper instruction depending on mode of dest. */ @@ -747,3 +750,302 @@ riscv_expand_block_move (rtx dest, rtx src, rtx length) } return false; } + +/* --- Vector expanders --- */ + +namespace riscv_vector { + +/* Used by cpymemsi in riscv.md . */ + +bool +expand_block_move (rtx dst_in, rtx src_in, rtx length_in) +{ + /* + memcpy: + mv a3, a0 # Copy destination + loop: + vsetvli t0, a2, e8, m8, ta, ma # Vectors of 8b + vle8.v v0, (a1) # Load bytes + add a1, a1, t0 # Bump pointer + sub a2, a2, t0 # Decrement count + vse8.v v0, (a3) # Store bytes + add a3, a3, t0 # Bump pointer + bnez a2, loop # Any more? + ret # Return + */ + if (!TARGET_VECTOR) + return false; + HOST_WIDE_INT potential_ew + = (MIN (MIN (MEM_ALIGN (src_in), MEM_ALIGN (dst_in)), BITS_PER_WORD) + / BITS_PER_UNIT); + machine_mode vmode = VOIDmode; + bool need_loop = true; + bool size_p = optimize_function_for_size_p (cfun); + rtx src, dst; + rtx end = gen_reg_rtx (Pmode); + rtx vec; + rtx length_rtx = length_in; + + if (CONST_INT_P (length_in)) + { + HOST_WIDE_INT length = INTVAL (length_in); + + /* By using LMUL=8, we can copy as many bytes in one go as there + are bits in a vector register. If the entire block thus fits, + we don't need a loop. */ + if (length <= TARGET_MIN_VLEN) + { + need_loop = false; + + /* If a single scalar load / store pair can do the job, leave it + to the scalar code to do that. */ + /* ??? If fast unaligned access is supported, the scalar code could + use suitably sized scalars irrespective of alignemnt. If that + gets fixed, we have to adjust the test here. */ + + if (pow2p_hwi (length) && length <= potential_ew) + return false; + } + + /* Find the vector mode to use. Using the largest possible element + size is likely to give smaller constants, and thus potentially + reducing code size. However, if we need a loop, we need to update + the pointers, and that is more complicated with a larger element + size, unless we use an immediate, which prevents us from dynamically + using the targets transfer size that the hart supports. And then, + unless we know the *exact* vector size of the hart, we'd need + multiple vsetvli / branch statements, so it's not even a size win. + If, in the future, we find an RISCV-V implementation that is slower + for small element widths, we might allow larger element widths for + loops too. 
*/ + if (need_loop) + potential_ew = 1; + for (; potential_ew; potential_ew >>= 1) + { + scalar_int_mode elem_mode; + unsigned HOST_WIDE_INT bits = potential_ew * BITS_PER_UNIT; + unsigned HOST_WIDE_INT per_iter; + HOST_WIDE_INT nunits; + + if (need_loop) + per_iter = TARGET_MIN_VLEN; + else + per_iter = length; + nunits = per_iter / potential_ew; + + /* Unless we get an implementation that's slow for small element + size / non-word-aligned accesses, we assume that the hardware + handles this well, and we don't want to complicate the code + with shifting word contents around or handling extra bytes at + the start and/or end. So we want the total transfer size and + alignment to fit with the element size. */ + if (length % potential_ew != 0 + || !int_mode_for_size (bits, 0).exists (&elem_mode)) + continue; + /* Find the mode to use for the copy inside the loop - or the + sole copy, if there is no loop. */ + if (!need_loop) + { + /* Try if we have an exact mode for the copy. */ + if (riscv_vector::get_vector_mode (elem_mode, + nunits).exists (&vmode)) + break; + /* Since we don't have a mode that exactlty matches the transfer + size, we'll need to use pred_store, which is not available + for all vector modes, but only iE_RVV_M* modes, hence trying + to find a vector mode for a merely rounded-up size is + pointless. + Still, by choosing a lower LMUL factor that still allows + an entire transfer, we can reduce register pressure. */ + for (unsigned lmul = 1; lmul <= 4; lmul <<= 1) + if (TARGET_MIN_VLEN * lmul <= nunits * BITS_PER_UNIT + /* Avoid loosing the option of using vsetivli . */ + && (nunits <= 31 * lmul || nunits > 31 * 8) + && (riscv_vector::get_vector_mode + (elem_mode, exact_div (BYTES_PER_RISCV_VECTOR * lmul, + potential_ew)).exists (&vmode))) + break; + } + + /* The RVVM8?I modes are notionally 8 * BYTES_PER_RISCV_VECTOR bytes + wide. BYTES_PER_RISCV_VECTOR can't be eavenly divided by + the sizes of larger element types; the LMUL factor of 8 can at + the moment be divided by the SEW, with SEW of up to 8 bytes, + but there are reserved encodings so there might be larger + SEW in the future. */ + if (riscv_vector::get_vector_mode + (elem_mode, exact_div (BYTES_PER_RISCV_VECTOR * 8, + potential_ew)).exists (&vmode)) + break; + + /* We may get here if we tried an element size that's larger than + the hardware supports, but we should at least find a suitable + byte vector mode. */ + gcc_assert (potential_ew > 1); + } + if (potential_ew > 1) + length_rtx = GEN_INT (length / potential_ew); + } + else + { + vmode = E_RVVM8QImode; + } + + /* A memcpy libcall in the worst case takes 3 instructions to prepare the + arguments + 1 for the call. When RVV should take 7 instructions and + we're optimizing for size a libcall may be preferable. */ + if (size_p && need_loop) + return false; + + /* length_rtx holds the (remaining) length of the required copy. + cnt holds the length we copy with the current load/store pair. 
*/ + rtx cnt = length_rtx; + rtx label = NULL_RTX; + rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0)); + rtx src_addr = copy_addr_to_reg (XEXP (src_in, 0)); + + if (need_loop) + { + length_rtx = copy_to_mode_reg (Pmode, length_rtx); + cnt = gen_reg_rtx (Pmode); + label = gen_label_rtx (); + + emit_label (label); + emit_insn (riscv_vector::gen_no_side_effects_vsetvl_rtx (vmode, cnt, + length_rtx)); + } + + vec = gen_reg_rtx (vmode); + src = change_address (src_in, vmode, src_addr); + dst = change_address (dst_in, vmode, dst_addr); + + /* If we don't need a loop and have a suitable mode to describe the size, + just do a load / store pair and leave it up to the later lazy code + motion pass to insert the appropriate vsetvli. */ + if (!need_loop && known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in))) + { + emit_move_insn (vec, src); + emit_move_insn (dst, vec); + } + else + { + machine_mode mask_mode = riscv_vector::get_vector_mode + (BImode, GET_MODE_NUNITS (vmode)).require (); + rtx mask = CONSTM1_RTX (mask_mode); + if (!satisfies_constraint_K (cnt)) + cnt= force_reg (Pmode, cnt); + rtx m_ops[] = {vec, mask, src}; + emit_nonvlmax_insn (code_for_pred_mov (vmode), + riscv_vector::UNARY_OP_TAMA, m_ops, cnt); + emit_insn (gen_pred_store (vmode, dst, mask, vec, cnt, + get_avl_type_rtx (riscv_vector::NONVLMAX))); + } + + if (need_loop) + { + emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, cnt))); + emit_insn (gen_rtx_SET (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, cnt))); + emit_insn (gen_rtx_SET (length_rtx, gen_rtx_MINUS (Pmode, length_rtx, cnt))); + + /* Emit the loop condition. */ + rtx test = gen_rtx_NE (VOIDmode, end, const0_rtx); + emit_jump_insn (gen_cbranch4 (Pmode, test, length_rtx, const0_rtx, label)); + emit_insn (gen_nop ()); + } + + return true; +} + + +/* Implement rawmemchr<mode> using vector instructions. + It can be assumed that the needle is in the haystack, otherwise the + behavior is undefined. */ + +void +expand_rawmemchr (machine_mode mode, rtx dst, rtx src, rtx pat) +{ + /* + rawmemchr: + loop: + vsetvli a1, zero, e[8,16,32,64], m1, ta, ma + vle[8,16,32,64]ff.v v8, (a0) # Load. + csrr a1, vl # Get number of bytes read. + vmseq.vx v0, v8, pat # v0 = (v8 == {pat, pat, ...}) + vfirst.m a2, v0 # Find first hit. + add a0, a0, a1 # Bump pointer. + bltz a2, loop # Not found? + + sub a0, a0, a1 # Go back by a1. + shll a2, a2, [0,1,2,3] # Shift to get byte offset. + add a0, a0, a2 # Add the offset. + + ret + */ + gcc_assert (TARGET_VECTOR); + + unsigned int isize = GET_MODE_SIZE (mode).to_constant (); + int lmul = riscv_autovec_lmul == RVV_DYNAMIC ? RVV_M8 : riscv_autovec_lmul; + poly_int64 nunits = exact_div (BYTES_PER_RISCV_VECTOR * lmul, isize); + + machine_mode vmode; + if (!riscv_vector::get_vector_mode (GET_MODE_INNER (mode), + nunits).exists (&vmode)) + gcc_unreachable (); + + machine_mode mask_mode = riscv_vector::get_mask_mode (vmode); + + rtx cnt = gen_reg_rtx (Pmode); + rtx end = gen_reg_rtx (Pmode); + rtx vec = gen_reg_rtx (vmode); + rtx mask = gen_reg_rtx (mask_mode); + + /* After finding the first vector element matching the needle, we + need to multiply by the vector element width (SEW) in order to + return a pointer to the matching byte. */ + unsigned int shift = exact_log2 (GET_MODE_SIZE (mode).to_constant ()); + + rtx src_addr = copy_addr_to_reg (XEXP (src, 0)); + + rtx loop = gen_label_rtx (); + emit_label (loop); + + rtx vsrc = change_address (src, vmode, src_addr); + + /* Emit a first-fault load. 
*/ + rtx vlops[] = {vec, vsrc}; + emit_vlmax_insn (code_for_pred_fault_load (vmode), + riscv_vector::UNARY_OP, vlops); + + /* Read how far we read. */ + if (Pmode == SImode) + emit_insn (gen_read_vlsi (cnt)); + else + emit_insn (gen_read_vldi_zero_extend (cnt)); + + /* Compare needle with haystack and store in a mask. */ + rtx eq = gen_rtx_EQ (mask_mode, gen_const_vec_duplicate (vmode, pat), vec); + rtx vmsops[] = {mask, eq, vec, pat}; + emit_nonvlmax_insn (code_for_pred_eqne_scalar (vmode), + riscv_vector::COMPARE_OP, vmsops, cnt); + + /* Find the first bit in the mask. */ + rtx vfops[] = {end, mask}; + emit_nonvlmax_insn (code_for_pred_ffs (mask_mode, Pmode), + riscv_vector::CPOP_OP, vfops, cnt); + + /* Bump the pointer. */ + emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, cnt))); + + /* Emit the loop condition. */ + rtx test = gen_rtx_LT (VOIDmode, end, const0_rtx); + emit_jump_insn (gen_cbranch4 (Pmode, test, end, const0_rtx, loop)); + + /* We overran by CNT, subtract it. */ + emit_insn (gen_rtx_SET (src_addr, gen_rtx_MINUS (Pmode, src_addr, cnt))); + + /* We found something at SRC + END * [1,2,4,8]. */ + emit_insn (gen_rtx_SET (end, gen_rtx_ASHIFT (Pmode, end, GEN_INT (shift)))); + emit_insn (gen_rtx_SET (dst, gen_rtx_PLUS (Pmode, src_addr, end))); +} + +} diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index 3fe8125801b..4374afe6765 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -1725,7 +1725,7 @@ force_vector_length_operand (rtx vl) return vl; } -static rtx +rtx gen_no_side_effects_vsetvl_rtx (machine_mode vmode, rtx vl, rtx avl) { unsigned int sew = get_sew (vmode); @@ -2015,206 +2015,6 @@ expand_tuple_move (rtx *ops) } } -/* Used by cpymemsi in riscv.md . */ - -bool -expand_block_move (rtx dst_in, rtx src_in, rtx length_in) -{ - /* - memcpy: - mv a3, a0 # Copy destination - loop: - vsetvli t0, a2, e8, m8, ta, ma # Vectors of 8b - vle8.v v0, (a1) # Load bytes - add a1, a1, t0 # Bump pointer - sub a2, a2, t0 # Decrement count - vse8.v v0, (a3) # Store bytes - add a3, a3, t0 # Bump pointer - bnez a2, loop # Any more? - ret # Return - */ - if (!TARGET_VECTOR) - return false; - HOST_WIDE_INT potential_ew - = (MIN (MIN (MEM_ALIGN (src_in), MEM_ALIGN (dst_in)), BITS_PER_WORD) - / BITS_PER_UNIT); - machine_mode vmode = VOIDmode; - bool need_loop = true; - bool size_p = optimize_function_for_size_p (cfun); - rtx src, dst; - rtx end = gen_reg_rtx (Pmode); - rtx vec; - rtx length_rtx = length_in; - - if (CONST_INT_P (length_in)) - { - HOST_WIDE_INT length = INTVAL (length_in); - - /* By using LMUL=8, we can copy as many bytes in one go as there - are bits in a vector register. If the entire block thus fits, - we don't need a loop. */ - if (length <= TARGET_MIN_VLEN) - { - need_loop = false; - - /* If a single scalar load / store pair can do the job, leave it - to the scalar code to do that. */ - /* ??? If fast unaligned access is supported, the scalar code could - use suitably sized scalars irrespective of alignemnt. If that - gets fixed, we have to adjust the test here. */ - - if (pow2p_hwi (length) && length <= potential_ew) - return false; - } - - /* Find the vector mode to use. Using the largest possible element - size is likely to give smaller constants, and thus potentially - reducing code size. 
However, if we need a loop, we need to update - the pointers, and that is more complicated with a larger element - size, unless we use an immediate, which prevents us from dynamically - using the targets transfer size that the hart supports. And then, - unless we know the *exact* vector size of the hart, we'd need - multiple vsetvli / branch statements, so it's not even a size win. - If, in the future, we find an RISCV-V implementation that is slower - for small element widths, we might allow larger element widths for - loops too. */ - if (need_loop) - potential_ew = 1; - for (; potential_ew; potential_ew >>= 1) - { - scalar_int_mode elem_mode; - unsigned HOST_WIDE_INT bits = potential_ew * BITS_PER_UNIT; - unsigned HOST_WIDE_INT per_iter; - HOST_WIDE_INT nunits; - - if (need_loop) - per_iter = TARGET_MIN_VLEN; - else - per_iter = length; - nunits = per_iter / potential_ew; - - /* Unless we get an implementation that's slow for small element - size / non-word-aligned accesses, we assume that the hardware - handles this well, and we don't want to complicate the code - with shifting word contents around or handling extra bytes at - the start and/or end. So we want the total transfer size and - alignment to fit with the element size. */ - if (length % potential_ew != 0 - || !int_mode_for_size (bits, 0).exists (&elem_mode)) - continue; - /* Find the mode to use for the copy inside the loop - or the - sole copy, if there is no loop. */ - if (!need_loop) - { - /* Try if we have an exact mode for the copy. */ - if (get_vector_mode (elem_mode, nunits).exists (&vmode)) - break; - /* Since we don't have a mode that exactlty matches the transfer - size, we'll need to use pred_store, which is not available - for all vector modes, but only iE_RVV_M* modes, hence trying - to find a vector mode for a merely rounded-up size is - pointless. - Still, by choosing a lower LMUL factor that still allows - an entire transfer, we can reduce register pressure. */ - for (unsigned lmul = 1; lmul <= 4; lmul <<= 1) - if (TARGET_MIN_VLEN * lmul <= nunits * BITS_PER_UNIT - /* Avoid loosing the option of using vsetivli . */ - && (nunits <= 31 * lmul || nunits > 31 * 8) - && (get_vector_mode - (elem_mode, - exact_div (BYTES_PER_RISCV_VECTOR * lmul, - potential_ew) - ).exists (&vmode))) - break; - } - - /* The RVVM8?I modes are notionally 8 * BYTES_PER_RISCV_VECTOR bytes - wide. BYTES_PER_RISCV_VECTOR can't be eavenly divided by - the sizes of larger element types; the LMUL factor of 8 can at - the moment be divided by the SEW, with SEW of up to 8 bytes, - but there are reserved encodings so there might be larger - SEW in the future. */ - if (get_vector_mode (elem_mode, - exact_div (BYTES_PER_RISCV_VECTOR * 8, - potential_ew)).exists (&vmode)) - break; - - /* We may get here if we tried an element size that's larger than - the hardware supports, but we should at least find a suitable - byte vector mode. */ - gcc_assert (potential_ew > 1); - } - if (potential_ew > 1) - length_rtx = GEN_INT (length / potential_ew); - } - else - { - vmode = E_RVVM8QImode; - } - - /* A memcpy libcall in the worst case takes 3 instructions to prepare the - arguments + 1 for the call. When RVV should take 7 instructions and - we're optimizing for size a libcall may be preferable. */ - if (size_p && need_loop) - return false; - - /* length_rtx holds the (remaining) length of the required copy. - cnt holds the length we copy with the current load/store pair. 
*/ - rtx cnt = length_rtx; - rtx label = NULL_RTX; - rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0)); - rtx src_addr = copy_addr_to_reg (XEXP (src_in, 0)); - - if (need_loop) - { - length_rtx = copy_to_mode_reg (Pmode, length_rtx); - cnt = gen_reg_rtx (Pmode); - label = gen_label_rtx (); - - emit_label (label); - emit_insn (gen_no_side_effects_vsetvl_rtx (vmode, cnt, length_rtx)); - } - - vec = gen_reg_rtx (vmode); - src = change_address (src_in, vmode, src_addr); - dst = change_address (dst_in, vmode, dst_addr); - - /* If we don't need a loop and have a suitable mode to describe the size, - just do a load / store pair and leave it up to the later lazy code - motion pass to insert the appropriate vsetvli. */ - if (!need_loop && known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in))) - { - emit_move_insn (vec, src); - emit_move_insn (dst, vec); - } - else - { - machine_mode mask_mode = get_vector_mode (BImode, GET_MODE_NUNITS (vmode)).require (); - rtx mask = CONSTM1_RTX (mask_mode); - if (!satisfies_constraint_K (cnt)) - cnt= force_reg (Pmode, cnt); - rtx m_ops[] = {vec, mask, src}; - emit_nonvlmax_insn (code_for_pred_mov (vmode), UNARY_OP_TAMA, - m_ops, cnt); - emit_insn (gen_pred_store (vmode, dst, mask, vec, cnt, - get_avl_type_rtx (NONVLMAX))); - } - - if (need_loop) - { - emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, cnt))); - emit_insn (gen_rtx_SET (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, cnt))); - emit_insn (gen_rtx_SET (length_rtx, gen_rtx_MINUS (Pmode, length_rtx, cnt))); - - /* Emit the loop condition. */ - rtx test = gen_rtx_NE (VOIDmode, end, const0_rtx); - emit_jump_insn (gen_cbranch4 (Pmode, test, length_rtx, const0_rtx, label)); - emit_insn (gen_nop ()); - } - - return true; -} - /* Return the vectorization machine mode for RVV according to LMUL. */ machine_mode preferred_simd_mode (scalar_mode mode) diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc index ed83fa8112e..adf84f20a44 100644 --- a/gcc/internal-fn.cc +++ b/gcc/internal-fn.cc @@ -3242,7 +3242,7 @@ expand_VEC_CONVERT (internal_fn, gcall *) gcc_unreachable (); } -/* Expand IFN_RAWMEMCHAR internal function. */ +/* Expand IFN_RAWMEMCHR internal function. 
*/ void expand_RAWMEMCHR (internal_fn, gcall *stmt) diff --git a/gcc/testsuite/gcc.dg/tree-prof/peel-2.c b/gcc/testsuite/gcc.dg/tree-prof/peel-2.c index ac417fb3b57..216e6552a58 100644 --- a/gcc/testsuite/gcc.dg/tree-prof/peel-2.c +++ b/gcc/testsuite/gcc.dg/tree-prof/peel-2.c @@ -1,4 +1,4 @@ -/* { dg-options "-O3 -fdump-tree-cunroll-details -fno-unroll-loops -fpeel-loops -fdump-tree-ch2-details-blocks" } */ +/* { dg-options "-O3 -fdump-tree-cunroll-details -fno-unroll-loops -fpeel-loops -fdump-tree-ch2-details-blocks -fno-tree-loop-distribute-patterns" } */ int a[100]; int n = 1000000; int zeroc; diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c index bf6335f6360..adf53b10def 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c @@ -1,9 +1,9 @@ -/* { dg-do run { target s390x-*-* } } */ +/* { dg-do run { target { { s390x-*-* } || { riscv_v } } } } */ /* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */ /* { dg-additional-options "-march=z13 -mzarch" { target s390x-*-* } } */ -/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { target s390x-*-* } } } */ -/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { target s390x-*-* } } } */ -/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { target s390x-*-* } } } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */ /* Rawmemchr pattern: reduction stmt and no store */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c index 83f5a35a322..6c8a485a3aa 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c @@ -1,9 +1,9 @@ -/* { dg-do run { target s390x-*-* } } */ +/* { dg-do run { target { { s390x-*-* } || { riscv_v } } } } */ /* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */ /* { dg-additional-options "-march=z13 -mzarch" { target s390x-*-* } } */ -/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { target s390x-*-* } } } */ -/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { target s390x-*-* } } } */ -/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { target s390x-*-* } } } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { target { { s390x-*-* } || { riscv_v } } } } } */ /* Rawmemchr pattern: reduction stmt and store */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/builtin/rawmemchr-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/builtin/rawmemchr-1.c new file mode 100644 index 00000000000..ba83cb3836f --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/builtin/rawmemchr-1.c @@ -0,0 +1,99 @@ +/* { dg-do run { target { riscv_v } } } */ +/* { dg-additional-options "-std=gnu99 -O2 -ftree-loop-distribution 
-fdump-tree-ldist-details" } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" } } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" } } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" } } */ + +#include <string.h> +#include <assert.h> +#include <stdint.h> +#include <stdlib.h> + +#define rawmemchrT(T, pattern) \ +__attribute__((noinline,noclone)) \ +T* rawmemchr_##T (T *s) \ +{ \ + while (*s != pattern) \ + ++s; \ + return s; \ +} + +rawmemchrT(int8_t, (int8_t)0xde) +rawmemchrT(uint8_t, 0xde) +rawmemchrT(int16_t, (int16_t)0xdead) +rawmemchrT(uint16_t, 0xdead) +rawmemchrT(int32_t, (int32_t)0xdeadbeef) +rawmemchrT(uint32_t, 0xdeadbeef) + +#define runT(T, pattern) \ +void run_##T () \ +{ \ + T *buf = malloc (4096 * 2 * sizeof(T)); \ + assert (buf != NULL); \ + memset (buf, 0xa, 4096 * 2 * sizeof(T)); \ + /* ensure q is 4096-byte aligned */ \ + T *q = (T*)((unsigned char *)buf \ + + (4096 - ((uintptr_t)buf & 4095))); \ + T *p; \ + /* unaligned + block boundary + 1st load */ \ + p = (T *) ((uintptr_t)q - 8); \ + p[2] = pattern; \ + assert ((rawmemchr_##T (&p[0]) == &p[2])); \ + p[2] = (T) 0xaaaaaaaa; \ + /* unaligned + block boundary + 2nd load */ \ + p = (T *) ((uintptr_t)q - 8); \ + p[6] = pattern; \ + assert ((rawmemchr_##T (&p[0]) == &p[6])); \ + p[6] = (T) 0xaaaaaaaa; \ + /* unaligned + 1st load */ \ + q[5] = pattern; \ + assert ((rawmemchr_##T (&q[2]) == &q[5])); \ + q[5] = (T) 0xaaaaaaaa; \ + /* unaligned + 2nd load */ \ + q[14] = pattern; \ + assert ((rawmemchr_##T (&q[2]) == &q[14])); \ + q[14] = (T) 0xaaaaaaaa; \ + /* unaligned + 3rd load */ \ + q[19] = pattern; \ + assert ((rawmemchr_##T (&q[2]) == &q[19])); \ + q[19] = (T) 0xaaaaaaaa; \ + /* unaligned + 4th load */ \ + q[25] = pattern; \ + assert ((rawmemchr_##T (&q[2]) == &q[25])); \ + q[25] = (T) 0xaaaaaaaa; \ + /* aligned + 1st load */ \ + q[5] = pattern; \ + assert ((rawmemchr_##T (&q[0]) == &q[5])); \ + q[5] = (T) 0xaaaaaaaa; \ + /* aligned + 2nd load */ \ + q[14] = pattern; \ + assert ((rawmemchr_##T (&q[0]) == &q[14])); \ + q[14] = (T) 0xaaaaaaaa; \ + /* aligned + 3rd load */ \ + q[19] = pattern; \ + assert ((rawmemchr_##T (&q[0]) == &q[19])); \ + q[19] = (T) 0xaaaaaaaa; \ + /* aligned + 4th load */ \ + q[25] = pattern; \ + assert ((rawmemchr_##T (&q[0]) == &q[25])); \ + q[25] = (T) 0xaaaaaaaa; \ + free (buf); \ +} + +runT(int8_t, (int8_t)0xde) +runT(uint8_t, 0xde) +runT(int16_t, (int16_t)0xdead) +runT(uint16_t, 0xdead) +runT(int32_t, (int32_t)0xdeadbeef) +runT(uint32_t, 0xdeadbeef) + +int main (void) +{ + run_uint8_t (); + run_int8_t (); + run_uint16_t (); + run_int16_t (); + run_uint32_t (); + run_int32_t (); + return 0; +} diff --git a/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp b/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp index b19aa7b4ae6..9f7a10d5b78 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp +++ b/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp @@ -81,6 +81,8 @@ foreach op $AUTOVEC_TEST_OPTS { "" "$op" dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/autovec/cond/*.\[cS\]]] \ "" "$op" + dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/autovec/builtin/*.\[cS\]]] \ + "" "$op" } # widening operation only test on LMUL < 8 -- 2.41.0