Previously, `expand_vec_setmem` only generated a vectorized memset when the whole fill fitted into a single vector store. Extend it to generate a loop for longer and unknown lengths.
The test cases now use -O1 so that they are not sensitive to scheduling. gcc/ChangeLog: * config/riscv/riscv-string.cc (use_vector_stringop_p): Add comment. (expand_vec_setmem): Use use_vector_stringop_p instead of check_vectorise_memory_operation. Add loop generation. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/base/setmem-1.c: Use -O1. Expect a loop instead of a libcall. Add test for unknown length. * gcc.target/riscv/rvv/base/setmem-2.c: Likewise. * gcc.target/riscv/rvv/base/setmem-3.c: Likewise and expect smaller lmul. --- gcc/config/riscv/riscv-string.cc | 83 ++++++++++++++----- .../gcc.target/riscv/rvv/base/setmem-1.c | 37 ++++++++- .../gcc.target/riscv/rvv/base/setmem-2.c | 37 ++++++++- .../gcc.target/riscv/rvv/base/setmem-3.c | 41 +++++++-- 4 files changed, 160 insertions(+), 38 deletions(-) diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc index 118c02a4021..91b0ec03118 100644 --- a/gcc/config/riscv/riscv-string.cc +++ b/gcc/config/riscv/riscv-string.cc @@ -1062,6 +1062,9 @@ struct stringop_info { MAX_EW is the maximum element width that the caller wants to use and LENGTH_IN is the length of the stringop in bytes. + + This is currently used for cpymem and setmem. If expand_vec_cmpmem switches + to using it too then check_vectorise_memory_operation can be removed. */ static bool @@ -1600,41 +1603,75 @@ check_vectorise_memory_operation (rtx length_in, HOST_WIDE_INT &lmul_out) bool expand_vec_setmem (rtx dst_in, rtx length_in, rtx fill_value_in) { - HOST_WIDE_INT lmul; + stringop_info info; + /* Check we are able and allowed to vectorise this operation; bail if not. */ - if (!check_vectorise_memory_operation (length_in, lmul)) + if (!use_vector_stringop_p (info, 1, length_in)) return false; - machine_mode vmode - = riscv_vector::get_vector_mode (QImode, BYTES_PER_RISCV_VECTOR * lmul) - .require (); + /* avl holds the (remaining) length of the required set. + cnt holds the length we set with the current store. 
*/ + rtx cnt = info.avl; rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0)); - rtx dst = change_address (dst_in, vmode, dst_addr); + rtx dst = change_address (dst_in, info.vmode, dst_addr); - rtx fill_value = gen_reg_rtx (vmode); + rtx fill_value = gen_reg_rtx (info.vmode); rtx broadcast_ops[] = { fill_value, fill_value_in }; - /* If the length is exactly vlmax for the selected mode, do that. - Otherwise, use a predicated store. */ - if (known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in))) + rtx label = NULL_RTX; + rtx mask = NULL_RTX; + + /* If we don't need a loop and the length is exactly vlmax for the selected + mode do a broadcast and store, otherwise use a predicated store. */ + if (!info.need_loop + && known_eq (GET_MODE_SIZE (info.vmode), INTVAL (length_in))) { - emit_vlmax_insn (code_for_pred_broadcast (vmode), UNARY_OP, - broadcast_ops); + emit_vlmax_insn (code_for_pred_broadcast (info.vmode), UNARY_OP, + broadcast_ops); emit_move_insn (dst, fill_value); + return true; } - else + + machine_mode mask_mode + = riscv_vector::get_vector_mode (BImode, + GET_MODE_NUNITS (info.vmode)).require (); + mask = CONSTM1_RTX (mask_mode); + if (!satisfies_constraint_K (cnt)) + cnt = force_reg (Pmode, cnt); + + if (info.need_loop) { - if (!satisfies_constraint_K (length_in)) - length_in = force_reg (Pmode, length_in); - emit_nonvlmax_insn (code_for_pred_broadcast (vmode), UNARY_OP, - broadcast_ops, length_in); - machine_mode mask_mode - = riscv_vector::get_vector_mode (BImode, GET_MODE_NUNITS (vmode)) - .require (); - rtx mask = CONSTM1_RTX (mask_mode); - emit_insn (gen_pred_store (vmode, dst, mask, fill_value, length_in, - get_avl_type_rtx (riscv_vector::NONVLMAX))); + info.avl = copy_to_mode_reg (Pmode, info.avl); + cnt = gen_reg_rtx (Pmode); + emit_insn (riscv_vector::gen_no_side_effects_vsetvl_rtx (info.vmode, cnt, + info.avl)); + } + + emit_nonvlmax_insn (code_for_pred_broadcast (info.vmode), + riscv_vector::UNARY_OP, broadcast_ops, cnt); + + if (info.need_loop) + { 
+ label = gen_label_rtx (); + + emit_label (label); + emit_insn (riscv_vector::gen_no_side_effects_vsetvl_rtx (info.vmode, cnt, + info.avl)); + } + + emit_insn (gen_pred_store (info.vmode, dst, mask, fill_value, cnt, + get_avl_type_rtx (riscv_vector::NONVLMAX))); + + if (info.need_loop) + { + emit_insn (gen_rtx_SET (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, cnt))); + emit_insn (gen_rtx_SET (info.avl, gen_rtx_MINUS (Pmode, info.avl, cnt))); + + /* Emit the loop condition. */ + rtx test = gen_rtx_NE (VOIDmode, info.avl, const0_rtx); + emit_jump_insn (gen_cbranch4 (Pmode, test, info.avl, const0_rtx, label)); + emit_insn (gen_nop ()); } return true; diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c index 22844ff348c..32d85ea4f14 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c @@ -1,6 +1,6 @@ /* { dg-do compile } */ /* { dg-add-options riscv_v } */ -/* { dg-additional-options "-O3 -mrvv-max-lmul=dynamic" } */ +/* { dg-additional-options "-O1 -mrvv-max-lmul=dynamic" } */ /* { dg-final { check-function-bodies "**" "" } } */ #define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8) @@ -91,13 +91,42 @@ f6 (void *a, int const b) return __builtin_memset (a, b, MIN_VECTOR_BYTES * 8); } -/* Don't vectorise if the move is too large for one operation. +/* Vectorise with loop for larger lengths ** f7: -** li\s+a2,\d+ -** tail\s+memset +** mv\s+[ta][0-7],a0 +** li\s+[ta][0-7],129 +** vsetvli\s+zero,[ta][0-7],e8,m8,ta,ma +** vmv.v.x\s+v8,a1 +XX \.L\d+: +** vsetvli\s+[ta][0-7],[ta][0-7],e8,m8,ta,ma +** vse8.v\s+v8,0\(a[0-9]\) +** add\s+[ta][0-7],[ta][0-7],[ta][0-7] +** sub\s+[ta][0-7],[ta][0-7],[ta][0-7] +** bne\s+[ta][0-7],zero,\.L\d+ +** ret */ void * f7 (void *a, int const b) { return __builtin_memset (a, b, MIN_VECTOR_BYTES * 8 + 1); } + +/* Vectorize with loop for unknown length. 
+** f8: +** mv\s+[ta][0-7],a0 +** mv\s+[ta][0-7],a2 +** vsetvli\s+zero,[ta][0-7],e8,m8,ta,ma +** vmv.v.x\s+v8,a1 +XX \.L\d+: +** vsetvli\s+[ta][0-7],[ta][0-7],e8,m8,ta,ma +** vse8.v\s+v8,0\(a[0-9]\) +** add\s+[ta][0-7],[ta][0-7],[ta][0-7] +** sub\s+[ta][0-7],[ta][0-7],[ta][0-7] +** bne\s+[ta][0-7],zero,\.L\d+ +** ret +*/ +void * +f8 (void *a, int const b, int n) +{ + return __builtin_memset (a, b, n); +} diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-2.c b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-2.c index faea442a4bd..9da1c9309d8 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-2.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-2.c @@ -1,6 +1,6 @@ /* { dg-do compile } */ /* { dg-add-options riscv_v } */ -/* { dg-additional-options "-O3 -mrvv-max-lmul=m1" } */ +/* { dg-additional-options "-O1 -mrvv-max-lmul=m1" } */ /* { dg-final { check-function-bodies "**" "" } } */ #define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8) @@ -39,13 +39,42 @@ f2 (void *a, int const b) return __builtin_memset (a, b, MIN_VECTOR_BYTES); } -/* Don't vectorise if the move is too large for requested lmul. +/* Vectorise with loop for larger lengths ** f3: -** li\s+a2,\d+ -** tail\s+memset +** mv\s+[ta][0-7],a0 +** li\s+[ta][0-7],17 +** vsetvli\s+zero,[ta][0-7],e8,m1,ta,ma +** vmv.v.x\s+v1,a1 +XX \.L\d+: +** vsetvli\s+[ta][0-7],[ta][0-7],e8,m1,ta,ma +** vse8.v\s+v1,0\(a[0-9]\) +** add\s+[ta][0-7],[ta][0-7],[ta][0-7] +** sub\s+[ta][0-7],[ta][0-7],[ta][0-7] +** bne\s+[ta][0-7],zero,\.L\d+ +** ret */ void * f3 (void *a, int const b) { return __builtin_memset (a, b, MIN_VECTOR_BYTES + 1); } + +/* Vectorize with loop for unknown length. 
+** f4: +** mv\s+[ta][0-7],a0 +** mv\s+[ta][0-7],a2 +** vsetvli\s+zero,[ta][0-7],e8,m1,ta,ma +** vmv.v.x\s+v1,a1 +XX \.L\d+: +** vsetvli\s+[ta][0-7],[ta][0-7],e8,m1,ta,ma +** vse8.v\s+v1,0\(a[0-9]\) +** add\s+[ta][0-7],[ta][0-7],[ta][0-7] +** sub\s+[ta][0-7],[ta][0-7],[ta][0-7] +** bne\s+[ta][0-7],zero,\.L\d+ +** ret +*/ +void * +f4 (void *a, int const b, int n) +{ + return __builtin_memset (a, b, n); +} diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-3.c b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-3.c index 25be694d248..2111a139ad4 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-3.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-3.c @@ -1,6 +1,6 @@ /* { dg-do compile } */ /* { dg-add-options riscv_v } */ -/* { dg-additional-options "-O3 -mrvv-max-lmul=m8" } */ +/* { dg-additional-options "-O1 -mrvv-max-lmul=m8" } */ /* { dg-final { check-function-bodies "**" "" } } */ #define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8) @@ -21,13 +21,13 @@ f1 (void *a, int const b) return __builtin_memset (a, b, MIN_VECTOR_BYTES - 1); } -/* Vectorise+inline minimum vector register width using requested lmul. +/* Vectorised code should use smallest lmul known to fit length. ** f2: ** ( -** vsetivli\s+zero,\d+,e8,m8,ta,ma +** vsetivli\s+zero,\d+,e8,m1,ta,ma ** | ** li\s+a\d+,\d+ -** vsetvli\s+zero,a\d+,e8,m8,ta,ma +** vsetvli\s+zero,a\d+,e8,m1,ta,ma ** ) ** vmv\.v\.x\s+v\d+,a1 ** vse8\.v\s+v\d+,0\(a0\) @@ -57,13 +57,40 @@ f3 (void *a, int const b) return __builtin_memset (a, b, MIN_VECTOR_BYTES * 8); } -/* Don't vectorise if the move is too large for requested lmul. 
+/* Vectorise with loop for larger lengths ** f4: -** li\s+a2,\d+ -** tail\s+memset +** mv\s+[ta][0-7],a0 +** li\s+[ta][0-7],129 +** vsetvli\s+zero,[ta][0-7],e8,m8,ta,ma +** vmv.v.x\s+v8,a1 +** vsetvli\s+[ta][0-7],[ta][0-7],e8,m8,ta,ma +** vse8.v\s+v8,0\(a[0-9]\) +** add\s+[ta][0-7],[ta][0-7],[ta][0-7] +** sub\s+[ta][0-7],[ta][0-7],[ta][0-7] +** bne\s+[ta][0-7],zero,\.L\d+ +** ret */ void * f4 (void *a, int const b) { return __builtin_memset (a, b, MIN_VECTOR_BYTES * 8 + 1); } + +/* Vectorize with loop for unknown length. +** f5: +** mv\s+[ta][0-7],a0 +** mv\s+[ta][0-7],a2 +** vsetvli\s+zero,[ta][0-7],e8,m8,ta,ma +** vmv.v.x\s+v8,a1 +** vsetvli\s+[ta][0-7],[ta][0-7],e8,m8,ta,ma +** vse8.v\s+v8,0\(a[0-9]\) +** add\s+[ta][0-7],[ta][0-7],[ta][0-7] +** sub\s+[ta][0-7],[ta][0-7],[ta][0-7] +** bne\s+[ta][0-7],zero,\.L\d+ +** ret +*/ +void * +f5 (void *a, int const b, int n) +{ + return __builtin_memset (a, b, n); +} -- 2.43.0