On Wed, 2023-04-19 at 11:03 +0800, Lulu Cheng wrote: /* snip */
> > +loongarch_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT > > length, > > + HOST_WIDE_INT delta) > > { > > - HOST_WIDE_INT offset, delta; > > - unsigned HOST_WIDE_INT bits; > > + HOST_WIDE_INT offs, delta_cur; > > int i; > > machine_mode mode; > > rtx *regs; > > > > - bits = MIN (BITS_PER_WORD, MIN (MEM_ALIGN (src), MEM_ALIGN > > (dest))); > > - > > - mode = int_mode_for_size (bits, 0).require (); > > - delta = bits / BITS_PER_UNIT; > > + HOST_WIDE_INT num_reg = length / delta; > > I think comments need to be added here, if it is not chasing the code, > it is not easy to understand. Pushed r14-70 with the following addition: + /* Calculate how many registers we'll need for the block move. + We'll emit length / delta move operations with delta as the size + first. Then we may still have length % delta bytes not copied. + We handle these remaining bytes by move operations with smaller + (halved) sizes. For example, if length = 21 and delta = 8, we'll + emit two ld.d/st.d pairs, one ld.w/st.w pair, and one ld.b/st.b + pair. For each load/store pair we use a dedicated register to keep + the pipeline as populated as possible. */ > Otherwise LGTM! > > Thanks! > > > + for (delta_cur = delta / 2; delta_cur != 0; delta_cur /= 2) > > + num_reg += !!(length & delta_cur); > > > > /* Allocate a buffer for the temporary registers. */ > > - regs = XALLOCAVEC (rtx, length / delta); > > + regs = XALLOCAVEC (rtx, num_reg); > > > > - /* Load as many BITS-sized chunks as possible. Use a normal load > > if > > - the source has enough alignment, otherwise use left/right > > pairs. 
*/ > > - for (offset = 0, i = 0; offset + delta <= length; offset += > > delta, i++) > > + for (delta_cur = delta, i = 0, offs = 0; offs < length; delta_cur > > /= 2) > > { > > - regs[i] = gen_reg_rtx (mode); > > - loongarch_emit_move (regs[i], adjust_address (src, mode, > > offset)); > > - } > > + mode = int_mode_for_size (delta_cur * BITS_PER_UNIT, > > 0).require (); > > > > - for (offset = 0, i = 0; offset + delta <= length; offset += > > delta, i++) > > - loongarch_emit_move (adjust_address (dest, mode, offset), > > regs[i]); > > + for (; offs + delta_cur <= length; offs += delta_cur, i++) > > + { > > + regs[i] = gen_reg_rtx (mode); > > + loongarch_emit_move (regs[i], adjust_address (src, mode, > > offs)); > > + } > > + } > > > > - /* Mop up any left-over bytes. */ > > - if (offset < length) > > + for (delta_cur = delta, i = 0, offs = 0; offs < length; delta_cur > > /= 2) > > { > > - src = adjust_address (src, BLKmode, offset); > > - dest = adjust_address (dest, BLKmode, offset); > > - move_by_pieces (dest, src, length - offset, > > - MIN (MEM_ALIGN (src), MEM_ALIGN (dest)), > > - (enum memop_ret) 0); > > + mode = int_mode_for_size (delta_cur * BITS_PER_UNIT, > > 0).require (); > > + > > + for (; offs + delta_cur <= length; offs += delta_cur, i++) > > + loongarch_emit_move (adjust_address (dest, mode, offs), > > regs[i]); > > } > > } > > > > @@ -4523,10 +4520,11 @@ loongarch_adjust_block_mem (rtx mem, > > HOST_WIDE_INT length, rtx *loop_reg, > > > > static void > > loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT > > length, > > - HOST_WIDE_INT bytes_per_iter) > > + HOST_WIDE_INT align) > > { > > rtx_code_label *label; > > rtx src_reg, dest_reg, final_src, test; > > + HOST_WIDE_INT bytes_per_iter = align * > > LARCH_MAX_MOVE_OPS_PER_LOOP_ITER; > > HOST_WIDE_INT leftover; > > > > leftover = length % bytes_per_iter; > > @@ -4546,7 +4544,7 @@ loongarch_block_move_loop (rtx dest, rtx src, > > HOST_WIDE_INT length, > > emit_label (label); > > > > /* Emit 
the loop body. */ > > - loongarch_block_move_straight (dest, src, bytes_per_iter); > > + loongarch_block_move_straight (dest, src, bytes_per_iter, align); > > > > /* Move on to the next block. */ > > loongarch_emit_move (src_reg, > > @@ -4563,7 +4561,7 @@ loongarch_block_move_loop (rtx dest, rtx src, > > HOST_WIDE_INT length, > > > > /* Mop up any left-over bytes. */ > > if (leftover) > > - loongarch_block_move_straight (dest, src, leftover); > > + loongarch_block_move_straight (dest, src, leftover, align); > > else > > /* Temporary fix for PR79150. */ > > emit_insn (gen_nop ()); > > @@ -4573,25 +4571,32 @@ loongarch_block_move_loop (rtx dest, rtx > > src, HOST_WIDE_INT length, > > memory reference SRC to memory reference DEST. */ > > > > bool > > -loongarch_expand_block_move (rtx dest, rtx src, rtx length) > > +loongarch_expand_block_move (rtx dest, rtx src, rtx r_length, rtx > > r_align) > > { > > - int max_move_bytes = LARCH_MAX_MOVE_BYTES_STRAIGHT; > > + if (!CONST_INT_P (r_length)) > > + return false; > > + > > + HOST_WIDE_INT length = INTVAL (r_length); > > + if (length > loongarch_max_inline_memcpy_size) > > + return false; > > + > > + HOST_WIDE_INT align = INTVAL (r_align); > > + > > + if (!TARGET_STRICT_ALIGN || align > UNITS_PER_WORD) > > + align = UNITS_PER_WORD; > > > > - if (CONST_INT_P (length) > > - && INTVAL (length) <= loongarch_max_inline_memcpy_size) > > + if (length <= align * LARCH_MAX_MOVE_OPS_STRAIGHT) > > { > > - if (INTVAL (length) <= max_move_bytes) > > - { > > - loongarch_block_move_straight (dest, src, INTVAL > > (length)); > > - return true; > > - } > > - else if (optimize) > > - { > > - loongarch_block_move_loop (dest, src, INTVAL (length), > > - > > LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER); > > - return true; > > - } > > + loongarch_block_move_straight (dest, src, length, align); > > + return true; > > + } > > + > > + if (optimize) > > + { > > + loongarch_block_move_loop (dest, src, length, align); > > + return true; > > } > > + > > return 
false; > > } > > > > diff --git a/gcc/config/loongarch/loongarch.h > > b/gcc/config/loongarch/loongarch.h > > index 7151d5cabb3..1bcd144a5d9 100644 > > --- a/gcc/config/loongarch/loongarch.h > > +++ b/gcc/config/loongarch/loongarch.h > > @@ -1063,13 +1063,13 @@ typedef struct { > > > > /* The maximum number of bytes that can be copied by one iteration > > of > > a cpymemsi loop; see loongarch_block_move_loop. */ > > -#define LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER (UNITS_PER_WORD * 4) > > +#define LARCH_MAX_MOVE_OPS_PER_LOOP_ITER 4 > > > > /* The maximum number of bytes that can be copied by a straight- > > line > > implementation of cpymemsi; see loongarch_block_move_straight. > > We want > > to make sure that any loop-based implementation will iterate at > > least twice. */ > > -#define LARCH_MAX_MOVE_BYTES_STRAIGHT > > (LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER * 2) > > +#define LARCH_MAX_MOVE_OPS_STRAIGHT > > (LARCH_MAX_MOVE_OPS_PER_LOOP_ITER * 2) > > > > /* The base cost of a memcpy call, for MOVE_RATIO and friends. > > These > > values were determined experimentally by benchmarking with > > CSiBE. > > @@ -1077,7 +1077,7 @@ typedef struct { > > #define LARCH_CALL_RATIO 8 > > > > /* Any loop-based implementation of cpymemsi will have at least > > - LARCH_MAX_MOVE_BYTES_STRAIGHT / UNITS_PER_WORD memory-to-memory > > + LARCH_MAX_MOVE_OPS_PER_LOOP_ITER memory-to-memory > > moves, so allow individual copies of fewer elements. > > > > When cpymemsi is not available, use a value approximating > > @@ -1088,9 +1088,7 @@ typedef struct { > > value of LARCH_CALL_RATIO to take that into account. */ > > > > #define MOVE_RATIO(speed) \ > > - (HAVE_cpymemsi \ > > - ? LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER / UNITS_PER_WORD \ > > - : CLEAR_RATIO (speed) / 2) > > + (HAVE_cpymemsi ? LARCH_MAX_MOVE_OPS_PER_LOOP_ITER : CLEAR_RATIO > > (speed) / 2) > > > > /* For CLEAR_RATIO, when optimizing for size, give a better > > estimate > > of the length of a memset call, but use the default otherwise. 
> > */ > > diff --git a/gcc/config/loongarch/loongarch.md > > b/gcc/config/loongarch/loongarch.md > > index 628ecc78088..816a943d155 100644 > > --- a/gcc/config/loongarch/loongarch.md > > +++ b/gcc/config/loongarch/loongarch.md > > @@ -2488,7 +2488,8 @@ (define_expand "cpymemsi" > > "" > > { > > if (TARGET_DO_OPTIMIZE_BLOCK_MOVE_P > > - && loongarch_expand_block_move (operands[0], operands[1], > > operands[2])) > > + && loongarch_expand_block_move (operands[0], operands[1], > > + operands[2], operands[3])) > > DONE; > > else > > FAIL; > > diff --git a/gcc/testsuite/gcc.target/loongarch/pr109465-1.c > > b/gcc/testsuite/gcc.target/loongarch/pr109465-1.c > > new file mode 100644 > > index 00000000000..4cd35d13904 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/loongarch/pr109465-1.c > > @@ -0,0 +1,9 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -mabi=lp64d -mno-strict-align" } */ > > +/* { dg-final { scan-assembler-times "st\\.d|stptr\\.d" 1 } } */ > > +/* { dg-final { scan-assembler-times "st\\.w|stptr\\.w" 1 } } */ > > +/* { dg-final { scan-assembler-times "st\\.h" 1 } } */ > > +/* { dg-final { scan-assembler-times "st\\.b" 1 } } */ > > + > > +extern char a[], b[]; > > +void test() { __builtin_memcpy(a, b, 15); } > > diff --git a/gcc/testsuite/gcc.target/loongarch/pr109465-2.c > > b/gcc/testsuite/gcc.target/loongarch/pr109465-2.c > > new file mode 100644 > > index 00000000000..703eb951c6d > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/loongarch/pr109465-2.c > > @@ -0,0 +1,9 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -mabi=lp64d -mstrict-align" } */ > > +/* { dg-final { scan-assembler-times "st\\.d|stptr\\.d" 1 } } */ > > +/* { dg-final { scan-assembler-times "st\\.w|stptr\\.w" 1 } } */ > > +/* { dg-final { scan-assembler-times "st\\.h" 1 } } */ > > +/* { dg-final { scan-assembler-times "st\\.b" 1 } } */ > > + > > +extern long a[], b[]; > > +void test() { __builtin_memcpy(a, b, 15); } > > diff --git 
a/gcc/testsuite/gcc.target/loongarch/pr109465-3.c > > b/gcc/testsuite/gcc.target/loongarch/pr109465-3.c > > new file mode 100644 > > index 00000000000..d6a80659b31 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/loongarch/pr109465-3.c > > @@ -0,0 +1,12 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -mabi=lp64d -mstrict-align" } */ > > + > > +/* Three loop iterations each contains 4 st.b, and 3 st.b after the > > loop */ > > +/* { dg-final { scan-assembler-times "st\\.b" 7 } } */ > > + > > +/* { dg-final { scan-assembler-not "st\\.h" } } */ > > +/* { dg-final { scan-assembler-not "st\\.w|stptr\\.w" } } */ > > +/* { dg-final { scan-assembler-not "st\\.d|stptr\\.d" } } */ > > + > > +extern char a[], b[]; > > +void test() { __builtin_memcpy(a, b, 15); } > -- Xi Ruoyao <xry...@xry111.site> School of Aerospace Science and Technology, Xidian University